"""howard.objects.variants"""

import csv
import gc
import gzip
import io
import multiprocessing
import os
import random
import re
import shlex
import sqlite3
import subprocess
from tempfile import NamedTemporaryFile, TemporaryDirectory
import tempfile
import duckdb
import json
import yaml
import argparse
import Bio.bgzf as bgzf
import pandas as pd
from pyfaidx import Fasta
import numpy as np
import vcf
import logging as log
import fastparquet as fp
from multiprocesspandas import applyparallel

from howard.functions.commons import *
from howard.objects.database import *
from howard.functions.databases import *
from howard.functions.utils import *


class Variants:

    def __init__(
        self,
        conn=None,
        input: str = None,
        output: str = None,
        config: dict = None,
        param: dict = None,
        load: bool = False,
    ) -> None:
        """
        Initialize the Variants object: input/output files, config, param,
        database connexion, header and samples, then optionally load the data.

        :param conn: existing database connexion (duckdb or sqlite); a new one
            is created when None
        :param input: input file path, or a file-like object with a ``name``
        :param output: output file path, or a file-like object with a ``name``
        :param config: configuration dictionary (a fresh empty dict when None)
        :param param: parameters dictionary (a fresh empty dict when None)
        :param load: when True, load the input data after initialization
        """

        # Fix: the previous mutable default arguments ({}) were shared across
        # all instances; use None sentinels and create fresh dicts per call.
        if config is None:
            config = {}
        if param is None:
            param = {}

        # Init variables
        self.init_variables()

        # Input
        self.set_input(input)

        # Config
        self.set_config(config)

        # Param
        self.set_param(param)

        # Output
        self.set_output(output)

        # connexion
        self.set_connexion(conn)

        # Header
        self.set_header()

        # Samples
        self.set_samples()

        # Load data
        if load:
            self.load_data()

    def set_samples(self, samples: list = None) -> list:
        """
        Set the ``samples`` attribute to the provided list, or retrieve it
        from the ``samples.list`` parameter when no list is given.

        :param samples: list of sample names; when falsy, read from
            ``param["samples"]["list"]`` (may end up None)
        :return: the samples list actually set
        """

        if not samples:
            samples = self.get_param().get("samples", {}).get("list", None)

        self.samples = samples

        return samples
87 88 :param samples: The `set_samples` method is a method of a class that takes a list of samples as 89 input and sets the `samples` attribute of the class to the provided list. If no samples are 90 provided, it tries to get the samples from the class's parameters using the `get_param` method 91 :type samples: list 92 :return: The `samples` list is being returned. 93 """ 94 95 if not samples: 96 samples = self.get_param().get("samples", {}).get("list", None) 97 98 self.samples = samples 99 100 return samples 101 102 def get_samples(self) -> list: 103 """ 104 This function returns a list of samples. 105 :return: The `get_samples` method is returning the `samples` attribute of the object. 106 """ 107 108 return self.samples 109 110 def get_samples_check(self) -> bool: 111 """ 112 This function returns the value of the "check" key within the "samples" dictionary retrieved 113 from the parameters. 114 :return: The method `get_samples_check` is returning the value of the key "check" inside the 115 "samples" dictionary, which is nested inside the dictionary returned by the `get_param()` 116 method. If the key "check" is not found, it will return `False`. 117 """ 118 119 return self.get_param().get("samples", {}).get("check", True) 120 121 def set_input(self, input: str = None) -> None: 122 """ 123 The function `set_input` takes a file name as input, extracts the name and extension, and sets 124 attributes in the class accordingly. 125 126 :param input: The `set_input` method in the provided code snippet is used to set attributes 127 related to the input file. 
Here's a breakdown of the parameters and their usage in the method: 128 :type input: str 129 """ 130 131 if input and not isinstance(input, str): 132 try: 133 self.input = input.name 134 except: 135 log.error(f"Input file '{input} in bad format") 136 raise ValueError(f"Input file '{input} in bad format") 137 else: 138 self.input = input 139 140 # Input format 141 if input: 142 input_name, input_extension = os.path.splitext(self.input) 143 self.input_name = input_name 144 self.input_extension = input_extension 145 self.input_format = self.input_extension.replace(".", "") 146 147 def set_config(self, config: dict) -> None: 148 """ 149 The set_config function takes a config object and assigns it as the configuration object for the 150 class. 151 152 :param config: The `config` parameter in the `set_config` function is a dictionary object that 153 contains configuration settings for the class. When you call the `set_config` function with a 154 dictionary object as the argument, it will set that dictionary as the configuration object for 155 the class 156 :type config: dict 157 """ 158 159 self.config = config 160 161 def set_param(self, param: dict) -> None: 162 """ 163 This function sets a parameter object for the class based on the input dictionary. 
164 165 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 166 as the `param` attribute of the class instance 167 :type param: dict 168 """ 169 170 self.param = param 171 172 def init_variables(self) -> None: 173 """ 174 This function initializes the variables that will be used in the rest of the class 175 """ 176 177 self.prefix = "howard" 178 self.table_variants = "variants" 179 self.dataframe = None 180 181 self.comparison_map = { 182 "gt": ">", 183 "gte": ">=", 184 "lt": "<", 185 "lte": "<=", 186 "equals": "=", 187 "contains": "SIMILAR TO", 188 } 189 190 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 191 192 self.code_type_map_to_sql = { 193 "Integer": "INTEGER", 194 "String": "VARCHAR", 195 "Float": "FLOAT", 196 "Flag": "VARCHAR", 197 } 198 199 self.index_additionnal_fields = [] 200 201 def get_indexing(self) -> bool: 202 """ 203 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 204 returns False. 205 :return: The value of the indexing parameter. 206 """ 207 208 return self.get_param().get("indexing", False) 209 210 def get_connexion_config(self) -> dict: 211 """ 212 The function `get_connexion_config` returns a dictionary containing the configuration for a 213 connection, including the number of threads and memory limit. 214 :return: a dictionary containing the configuration for the Connexion library. 
215 """ 216 217 # config 218 config = self.get_config() 219 220 # Connexion config 221 connexion_config = {} 222 threads = self.get_threads() 223 224 # Threads 225 if threads: 226 connexion_config["threads"] = threads 227 228 # Memory 229 # if config.get("memory", None): 230 # connexion_config["memory_limit"] = config.get("memory") 231 if self.get_memory(): 232 connexion_config["memory_limit"] = self.get_memory() 233 234 # Temporary directory 235 if config.get("tmp", None): 236 connexion_config["temp_directory"] = config.get("tmp") 237 238 # Access 239 if config.get("access", None): 240 access = config.get("access") 241 if access in ["RO"]: 242 access = "READ_ONLY" 243 elif access in ["RW"]: 244 access = "READ_WRITE" 245 connexion_db = self.get_connexion_db() 246 if connexion_db in ":memory:": 247 access = "READ_WRITE" 248 connexion_config["access_mode"] = access 249 250 return connexion_config 251 252 def get_duckdb_settings(self) -> dict: 253 """ 254 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 255 string. 256 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 257 """ 258 259 # config 260 config = self.get_config() 261 262 # duckdb settings 263 duckdb_settings_dict = {} 264 if config.get("duckdb_settings", None): 265 duckdb_settings = config.get("duckdb_settings") 266 duckdb_settings = full_path(duckdb_settings) 267 # duckdb setting is a file 268 if os.path.exists(duckdb_settings): 269 with open(duckdb_settings) as json_file: 270 duckdb_settings_dict = yaml.safe_load(json_file) 271 # duckdb settings is a string 272 else: 273 duckdb_settings_dict = json.loads(duckdb_settings) 274 275 return duckdb_settings_dict 276 277 def set_connexion_db(self) -> str: 278 """ 279 The function `set_connexion_db` returns the appropriate database connection string based on the 280 input format and connection type. 281 :return: the value of the variable `connexion_db`. 
282 """ 283 284 # Default connexion db 285 default_connexion_db = ":memory:" 286 287 # Find connexion db 288 if self.get_input_format() in ["db", "duckdb"]: 289 connexion_db = self.get_input() 290 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 291 connexion_db = default_connexion_db 292 elif self.get_connexion_type() in ["tmpfile"]: 293 tmp_name = tempfile.mkdtemp( 294 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 295 ) 296 connexion_db = f"{tmp_name}/tmp.db" 297 elif self.get_connexion_type() != "": 298 connexion_db = self.get_connexion_type() 299 else: 300 connexion_db = default_connexion_db 301 302 # Set connexion db 303 self.connexion_db = connexion_db 304 305 return connexion_db 306 307 def set_connexion(self, conn) -> None: 308 """ 309 The function `set_connexion` creates a connection to a database, with options for different 310 database formats and settings. 311 312 :param conn: The `conn` parameter in the `set_connexion` method is the connection to the 313 database. If a connection is not provided, a new connection to an in-memory database is created. 
314 The method then proceeds to set up the connection based on the specified format (e.g., duckdb or 315 sqlite 316 """ 317 318 # Connexion db 319 connexion_db = self.set_connexion_db() 320 321 # Connexion config 322 connexion_config = self.get_connexion_config() 323 324 # Connexion format 325 connexion_format = self.get_config().get("connexion_format", "duckdb") 326 # Set connexion format 327 self.connexion_format = connexion_format 328 329 # Connexion 330 if not conn: 331 if connexion_format in ["duckdb"]: 332 conn = duckdb.connect(connexion_db, config=connexion_config) 333 # duckDB settings 334 duckdb_settings = self.get_duckdb_settings() 335 if duckdb_settings: 336 for setting in duckdb_settings: 337 setting_value = duckdb_settings.get(setting) 338 if isinstance(setting_value, str): 339 setting_value = f"'{setting_value}'" 340 conn.execute(f"PRAGMA {setting}={setting_value};") 341 elif connexion_format in ["sqlite"]: 342 conn = sqlite3.connect(connexion_db) 343 344 # Set connexion 345 self.conn = conn 346 347 # Log 348 log.debug(f"connexion_format: {connexion_format}") 349 log.debug(f"connexion_db: {connexion_db}") 350 log.debug(f"connexion config: {connexion_config}") 351 log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}") 352 353 def set_output(self, output: str = None) -> None: 354 """ 355 The `set_output` function in Python sets the output file based on the input or a specified key 356 in the config file, extracting the output name, extension, and format. 357 358 :param output: The `output` parameter in the `set_output` method is used to specify the name of 359 the output file. If the config file has an 'output' key, the method sets the output to the value 360 of that key. 
    def set_header(self) -> None:
        """
        Read the VCF header of the input and store it both as a list of lines
        (``self.header_list``) and as a ``vcf.Reader`` object
        (``self.header_vcf``); both are None when there is no input file.

        Header lookup order for tabular/parquet/db inputs:
        1. explicit ``header_file`` in config;
        2. the header embedded in a .vcf/.hdr input (bgzf-compressed or not);
        3. a sidecar ``<input>.hdr`` file;
        4. a header reconstructed from the file's columns via ``Database``;
        5. a minimal default VCF header as last resort.

        :raises ValueError: when the input format is not supported
        """

        input_file = self.get_input()
        # Minimal fallback header.
        # NOTE(review): the #CHROM line below is space-separated; a real VCF
        # column line is tab-separated — TODO confirm this is intentional
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM POS ID REF ALT QUAL FILTER INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itsself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file (vcf.Writer emits the header
                            # lines on construction)
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with rel columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    # NOTE(review): bare except deliberately falls back to the
                    # default header on any reconstruction failure
                    except:

                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # try for unknown format ?

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            self.header_list = None
            self.header_vcf = None
    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
        """
        Execute a SQL query and return the result as a pandas DataFrame,
        dispatching on the connexion format ("duckdb" or "sqlite").

        :param query: the SQL query to execute
        :param limit: maximum number of rows to fetch; when None (or 0), the
            full result set is returned
        :return: the query result as a pandas DataFrame
        """

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Limit in query
        if limit:
            # NOTE(review): mutates a global pandas display option as a side
            # effect — presumably so the truncated frame prints fully; confirm
            # this is intentional
            pd.set_option("display.max_rows", limit)
            if connexion_format in ["duckdb"]:
                # Stream the first record batch of size `limit`
                df = (
                    self.conn.execute(query)
                    .fetch_record_batch(limit)
                    .read_next_batch()
                    .to_pandas()
                )
            elif connexion_format in ["sqlite"]:
                # chunksize turns read_sql_query into an iterator; take the
                # first chunk only.
                # NOTE(review): next() raises StopIteration when the query
                # returns no rows — TODO confirm empty results cannot occur
                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))

        # Full query
        else:
            if connexion_format in ["duckdb"]:
                df = self.conn.execute(query).df()
            elif connexion_format in ["sqlite"]:
                df = pd.read_sql_query(query, self.conn)

        return df
This query will be used to fetch data from a 489 database and convert it into a pandas DataFrame 490 :type query: str 491 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 492 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 493 function will only fetch up to that number of rows from the database query result. If no limit 494 is specified, 495 :type limit: int 496 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 497 """ 498 499 # Connexion format 500 connexion_format = self.get_connexion_format() 501 502 # Limit in query 503 if limit: 504 pd.set_option("display.max_rows", limit) 505 if connexion_format in ["duckdb"]: 506 df = ( 507 self.conn.execute(query) 508 .fetch_record_batch(limit) 509 .read_next_batch() 510 .to_pandas() 511 ) 512 elif connexion_format in ["sqlite"]: 513 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 514 515 # Full query 516 else: 517 if connexion_format in ["duckdb"]: 518 df = self.conn.execute(query).df() 519 elif connexion_format in ["sqlite"]: 520 df = pd.read_sql_query(query, self.conn) 521 522 return df 523 524 def get_overview(self) -> None: 525 """ 526 The function prints the input, output, config, and dataframe of the current object 527 """ 528 table_variants_from = self.get_table_variants(clause="from") 529 sql_columns = self.get_header_columns_as_sql() 530 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 531 df = self.get_query_to_df(sql_query_export) 532 log.info( 533 "Input: " 534 + str(self.get_input()) 535 + " [" 536 + str(str(self.get_input_format())) 537 + "]" 538 ) 539 log.info( 540 "Output: " 541 + str(self.get_output()) 542 + " [" 543 + str(str(self.get_output_format())) 544 + "]" 545 ) 546 log.info("Config: ") 547 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 548 "\n" 549 ): 550 log.info("\t" + str(d)) 551 log.info("Param: ") 552 for d 
in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 553 "\n" 554 ): 555 log.info("\t" + str(d)) 556 log.info("Sample list: " + str(self.get_header_sample_list())) 557 log.info("Dataframe: ") 558 for d in str(df).split("\n"): 559 log.info("\t" + str(d)) 560 561 # garbage collector 562 del df 563 gc.collect() 564 565 return None 566 567 def get_stats(self) -> dict: 568 """ 569 The `get_stats` function calculates and returns various statistics of the current object, 570 including information about the input file, variants, samples, header fields, quality, and 571 SNVs/InDels. 572 :return: a dictionary containing various statistics of the current object. The dictionary has 573 the following structure: 574 """ 575 576 # Log 577 log.info(f"Stats Calculation...") 578 579 # table varaints 580 table_variants_from = self.get_table_variants() 581 582 # stats dict 583 stats = {"Infos": {}} 584 585 ### File 586 input_file = self.get_input() 587 stats["Infos"]["Input file"] = input_file 588 589 # Header 590 header_infos = self.get_header().infos 591 header_formats = self.get_header().formats 592 header_infos_list = list(header_infos) 593 header_formats_list = list(header_formats) 594 595 ### Variants 596 597 stats["Variants"] = {} 598 599 # Variants by chr 600 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 601 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 602 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 603 by=["CHROM"], kind="quicksort" 604 ) 605 606 # Total number of variants 607 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 608 609 # Calculate percentage 610 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 611 lambda x: (x / nb_of_variants) 612 ) 613 614 stats["Variants"]["Number of variants by chromosome"] = ( 615 nb_of_variants_by_chrom.to_dict(orient="index") 616 ) 617 618 
stats["Infos"]["Number of variants"] = int(nb_of_variants) 619 620 ### Samples 621 622 # Init 623 samples = {} 624 nb_of_samples = 0 625 626 # Check Samples 627 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 628 log.debug(f"Check samples...") 629 for sample in self.get_header_sample_list(): 630 sql_query_samples = f""" 631 SELECT '{sample}' as sample, 632 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 633 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 634 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 635 FROM {table_variants_from} 636 WHERE ( 637 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 638 AND 639 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 640 ) 641 GROUP BY genotype 642 """ 643 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 644 sample_genotype_count = sql_query_genotype_df["count"].sum() 645 if len(sql_query_genotype_df): 646 nb_of_samples += 1 647 samples[f"{sample} - {sample_genotype_count} variants"] = ( 648 sql_query_genotype_df.to_dict(orient="index") 649 ) 650 651 stats["Samples"] = samples 652 stats["Infos"]["Number of samples"] = nb_of_samples 653 654 # # 655 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 656 # stats["Infos"]["Number of samples"] = nb_of_samples 657 # elif nb_of_samples: 658 # stats["Infos"]["Number of samples"] = "not a VCF format" 659 660 ### INFO and FORMAT fields 661 header_types_df = {} 662 header_types_list = { 663 "List of INFO fields": header_infos, 664 "List of FORMAT fields": header_formats, 665 } 666 i = 0 667 for header_type in header_types_list: 668 669 header_type_infos = header_types_list.get(header_type) 670 header_infos_dict = {} 671 672 for info in header_type_infos: 673 674 i += 1 675 header_infos_dict[i] = {} 676 677 # ID 678 header_infos_dict[i]["id"] = info 679 680 # num 681 genotype_map = 
{None: ".", -1: "A", -2: "G", -3: "R"} 682 if header_type_infos[info].num in genotype_map.keys(): 683 header_infos_dict[i]["Number"] = genotype_map.get( 684 header_type_infos[info].num 685 ) 686 else: 687 header_infos_dict[i]["Number"] = header_type_infos[info].num 688 689 # type 690 if header_type_infos[info].type: 691 header_infos_dict[i]["Type"] = header_type_infos[info].type 692 else: 693 header_infos_dict[i]["Type"] = "." 694 695 # desc 696 if header_type_infos[info].desc != None: 697 header_infos_dict[i]["Description"] = header_type_infos[info].desc 698 else: 699 header_infos_dict[i]["Description"] = "" 700 701 if len(header_infos_dict): 702 header_types_df[header_type] = pd.DataFrame.from_dict( 703 header_infos_dict, orient="index" 704 ).to_dict(orient="index") 705 706 # Stats 707 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 708 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 709 stats["Header"] = header_types_df 710 711 ### QUAL 712 if "QUAL" in self.get_header_columns(): 713 sql_query_qual = f""" 714 SELECT 715 avg(CAST(QUAL AS INTEGER)) AS Average, 716 min(CAST(QUAL AS INTEGER)) AS Minimum, 717 max(CAST(QUAL AS INTEGER)) AS Maximum, 718 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 719 median(CAST(QUAL AS INTEGER)) AS Median, 720 variance(CAST(QUAL AS INTEGER)) AS Variance 721 FROM {table_variants_from} 722 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 723 """ 724 725 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 726 stats["Quality"] = {"Stats": qual} 727 728 ### SNV and InDel 729 730 sql_query_snv = f""" 731 732 SELECT Type, count FROM ( 733 734 SELECT 735 'Total' AS Type, 736 count(*) AS count 737 FROM {table_variants_from} 738 739 UNION 740 741 SELECT 742 'MNV' AS Type, 743 count(*) AS count 744 FROM {table_variants_from} 745 WHERE len(REF) > 1 AND len(ALT) > 1 746 AND len(REF) = len(ALT) 747 748 UNION 749 750 SELECT 751 'InDel' AS Type, 752 count(*) AS count 753 FROM 
    def stats_to_file(self, file: str = None) -> str:
        """
        Compute the statistics and write them as JSON to the given file.

        :param file: path of the JSON file to write
        :return: the path of the file that was written
        """

        # Get stats
        stats = self.get_stats()

        # Serializing json
        json_object = json.dumps(stats, indent=4)

        # Writing to sample.json
        with open(file, "w") as outfile:
            outfile.write(json_object)

        return file

    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        Generate a markdown report of the statistics (and the backing JSON
        file), then print the markdown to stdout.

        :param output_file: path of the markdown output; defaults to
            "stats.md" in a temporary directory
        :param json_file: path of the JSON stats file; defaults to
            "stats.json" in a temporary directory
        :return: None
        """

        # Full path
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Files
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create folders
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Create stats JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Print stats file (yaml.safe_load also parses JSON)
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Output buffers: title, index (table of contents) and body
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # Process sections
            for section in stats:
                infos = stats.get(section)
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # Render as a markdown table when the value is
                        # dict-like (directly or as a JSON string), otherwise
                        # as a plain bullet line
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info))), orient="index"
                                )
                                is_df = True
                            except:
                                is_df = False
                        if is_df:
                            # NOTE(review): df.to_markdown requires the
                            # optional "tabulate" dependency — confirm it is
                            # installed with pandas here
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f" - [{info}]({info_link})")
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    output.append(f"NA")

            # Write stats in markdown file
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Output stats in markdown
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None
json.loads((infos.get(info))), orient="index" 882 ) 883 is_df = True 884 except: 885 is_df = False 886 if is_df: 887 output.append(f"### {info}") 888 info_link = "#" + info.lower().replace(" ", "-") 889 output_index.append(f" - [{info}]({info_link})") 890 output.append(f"{df.to_markdown(index=False)}") 891 else: 892 output.append(f"- {info}: {infos.get(info)}") 893 else: 894 output.append(f"NA") 895 896 # Write stats in markdown file 897 with open(output_file, "w") as fp: 898 for item in output_title: 899 fp.write("%s\n" % item) 900 for item in output_index: 901 fp.write("%s\n" % item) 902 for item in output: 903 fp.write("%s\n" % item) 904 905 # Output stats in markdown 906 print("") 907 print("\n\n".join(output_title)) 908 print("") 909 print("\n\n".join(output)) 910 print("") 911 912 return None 913 914 def get_input(self) -> str: 915 """ 916 It returns the value of the input variable. 917 :return: The input is being returned. 918 """ 919 return self.input 920 921 def get_input_format(self, input_file: str = None) -> str: 922 """ 923 This function returns the format of the input variable, either from the provided input file or 924 by prompting for input. 925 926 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 927 represents the file path of the input file. If no `input_file` is provided when calling the 928 method, it will default to `None` 929 :type input_file: str 930 :return: The format of the input variable is being returned. 931 """ 932 933 if not input_file: 934 input_file = self.get_input() 935 input_format = get_file_format(input_file) 936 return input_format 937 938 def get_input_compressed(self, input_file: str = None) -> str: 939 """ 940 The function `get_input_compressed` returns the format of the input variable after compressing 941 it. 942 943 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 944 that represents the file path of the input file. 
If no `input_file` is provided when calling the 945 method, it will default to `None` and the method will then call `self.get_input()` to 946 :type input_file: str 947 :return: The function `get_input_compressed` returns the compressed format of the input 948 variable. 949 """ 950 951 if not input_file: 952 input_file = self.get_input() 953 input_compressed = get_file_compressed(input_file) 954 return input_compressed 955 956 def get_output(self) -> str: 957 """ 958 It returns the output of the neuron. 959 :return: The output of the neural network. 960 """ 961 962 return self.output 963 964 def get_output_format(self, output_file: str = None) -> str: 965 """ 966 The function `get_output_format` returns the format of the input variable or the output file if 967 provided. 968 969 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 970 that represents the file path of the output file. If no `output_file` is provided when calling 971 the method, it will default to the output obtained from the `get_output` method of the class 972 instance. The 973 :type output_file: str 974 :return: The format of the input variable is being returned. 975 """ 976 977 if not output_file: 978 output_file = self.get_output() 979 output_format = get_file_format(output_file) 980 981 return output_format 982 983 def get_config(self) -> dict: 984 """ 985 It returns the config 986 :return: The config variable is being returned. 987 """ 988 return self.config 989 990 def get_param(self) -> dict: 991 """ 992 It returns the param 993 :return: The param variable is being returned. 994 """ 995 return self.param 996 997 def get_connexion_db(self) -> str: 998 """ 999 It returns the connexion_db attribute of the object 1000 :return: The connexion_db is being returned. 1001 """ 1002 return self.connexion_db 1003 1004 def get_prefix(self) -> str: 1005 """ 1006 It returns the prefix of the object. 1007 :return: The prefix is being returned. 
1008 """ 1009 return self.prefix 1010 1011 def get_table_variants(self, clause: str = "select") -> str: 1012 """ 1013 This function returns the table_variants attribute of the object 1014 1015 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 1016 defaults to select (optional) 1017 :return: The table_variants attribute of the object. 1018 """ 1019 1020 # Access 1021 access = self.get_config().get("access", None) 1022 1023 # Clauses "select", "where", "update" 1024 if clause in ["select", "where", "update"]: 1025 table_variants = self.table_variants 1026 # Clause "from" 1027 elif clause in ["from"]: 1028 # For Read Only 1029 if self.get_input_format() in ["parquet"] and access in ["RO"]: 1030 input_file = self.get_input() 1031 table_variants = f"'{input_file}' as variants" 1032 # For Read Write 1033 else: 1034 table_variants = f"{self.table_variants} as variants" 1035 else: 1036 table_variants = self.table_variants 1037 return table_variants 1038 1039 def get_tmp_dir(self) -> str: 1040 """ 1041 The function `get_tmp_dir` returns the temporary directory path based on configuration 1042 parameters or a default path. 1043 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1044 configuration, parameters, and a default value of "/tmp". 1045 """ 1046 1047 return get_tmp( 1048 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1049 ) 1050 1051 def get_connexion_type(self) -> str: 1052 """ 1053 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1054 1055 :return: The connexion type is being returned. 1056 """ 1057 return self.get_config().get("connexion_type", "memory") 1058 1059 def get_connexion(self): 1060 """ 1061 It returns the connection object 1062 1063 :return: The connection object. 1064 """ 1065 return self.conn 1066 1067 def close_connexion(self) -> None: 1068 """ 1069 This function closes the connection to the database. 
1070 :return: The connection is being closed. 1071 """ 1072 return self.conn.close() 1073 1074 def get_header(self, type: str = "vcf"): 1075 """ 1076 This function returns the header of the VCF file as a list of strings 1077 1078 :param type: the type of header you want to get, defaults to vcf (optional) 1079 :return: The header of the vcf file. 1080 """ 1081 1082 if self.header_vcf: 1083 if type == "vcf": 1084 return self.header_vcf 1085 elif type == "list": 1086 return self.header_list 1087 else: 1088 if type == "vcf": 1089 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1090 return header 1091 elif type == "list": 1092 return vcf_required 1093 1094 def get_header_length(self, file: str = None) -> int: 1095 """ 1096 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1097 line. 1098 1099 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1100 header file. If this argument is provided, the function will read the header from the specified 1101 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1102 :type file: str 1103 :return: the length of the header list, excluding the #CHROM line. 1104 """ 1105 1106 if file: 1107 return len(self.read_vcf_header_file(file=file)) - 1 1108 elif self.get_header(type="list"): 1109 return len(self.get_header(type="list")) - 1 1110 else: 1111 return 0 1112 1113 def get_header_columns(self) -> str: 1114 """ 1115 This function returns the header list of a VCF 1116 1117 :return: The length of the header list. 1118 """ 1119 if self.get_header(): 1120 return self.get_header(type="list")[-1] 1121 else: 1122 return "" 1123 1124 def get_header_columns_as_list(self) -> list: 1125 """ 1126 This function returns the header list of a VCF 1127 1128 :return: The length of the header list. 
1129 """ 1130 if self.get_header(): 1131 return self.get_header_columns().strip().split("\t") 1132 else: 1133 return [] 1134 1135 def get_header_columns_as_sql(self) -> str: 1136 """ 1137 This function retruns header length (without #CHROM line) 1138 1139 :return: The length of the header list. 1140 """ 1141 sql_column_list = [] 1142 for col in self.get_header_columns_as_list(): 1143 sql_column_list.append(f'"{col}"') 1144 return ",".join(sql_column_list) 1145 1146 def get_header_sample_list( 1147 self, check: bool = False, samples: list = None, samples_force: bool = False 1148 ) -> list: 1149 """ 1150 The function `get_header_sample_list` returns a list of samples from a VCF header, with optional 1151 checking and filtering based on input parameters. 1152 1153 :param check: The `check` parameter in the `get_header_sample_list` function is a boolean 1154 parameter that determines whether to check if the samples in the list are properly defined as 1155 genotype columns. If `check` is set to `True`, the function will verify if each sample in the 1156 list is defined as a, defaults to False 1157 :type check: bool (optional) 1158 :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that 1159 allows you to specify a subset of samples from the header. If you provide a list of sample 1160 names, the function will check if each sample is defined in the header. If a sample is not found 1161 in the 1162 :type samples: list 1163 :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is 1164 a boolean parameter that determines whether to force the function to return the sample list 1165 without checking if the samples are genotype columns. 
If `samples_force` is set to `True`, the 1166 function will return the sample list without performing, defaults to False 1167 :type samples_force: bool (optional) 1168 :return: The function `get_header_sample_list` returns a list of samples based on the input 1169 parameters and conditions specified in the function. 1170 """ 1171 1172 # Init 1173 samples_list = [] 1174 1175 if samples is None: 1176 samples_list = self.header_vcf.samples 1177 else: 1178 samples_checked = [] 1179 for sample in samples: 1180 if sample in self.header_vcf.samples: 1181 samples_checked.append(sample) 1182 else: 1183 log.warning(f"Sample '{sample}' not defined in header") 1184 samples_list = samples_checked 1185 1186 # Force sample list without checking if is_genotype_column 1187 if samples_force: 1188 log.warning(f"Samples {samples_list} not checked if genotypes") 1189 return samples_list 1190 1191 if check: 1192 samples_checked = [] 1193 for sample in samples_list: 1194 if self.is_genotype_column(column=sample): 1195 samples_checked.append(sample) 1196 else: 1197 log.warning( 1198 f"Sample '{sample}' not defined as a sample (genotype not well defined)" 1199 ) 1200 samples_list = samples_checked 1201 1202 # Return samples list 1203 return samples_list 1204 1205 def is_genotype_column(self, column: str = None) -> bool: 1206 """ 1207 This function checks if a given column is a genotype column in a database. 1208 1209 :param column: The `column` parameter in the `is_genotype_column` method is a string that 1210 represents the column name in a database table. This method checks if the specified column is a 1211 genotype column in the database. If a column name is provided, it calls the `is_genotype_column` 1212 method of 1213 :type column: str 1214 :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter 1215 is not None, it calls the `is_genotype_column` method of the `Database` class with the specified 1216 column name and returns the result. 
If the `column` parameter is None, it returns False. 1217 """ 1218 1219 if column is not None: 1220 return Database(database=self.get_input()).is_genotype_column(column=column) 1221 else: 1222 return False 1223 1224 def get_verbose(self) -> bool: 1225 """ 1226 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1227 exist 1228 1229 :return: The value of the key "verbose" in the config dictionary. 1230 """ 1231 return self.get_config().get("verbose", False) 1232 1233 def get_connexion_format(self) -> str: 1234 """ 1235 It returns the connexion format of the object. 1236 :return: The connexion_format is being returned. 1237 """ 1238 connexion_format = self.connexion_format 1239 if connexion_format not in ["duckdb", "sqlite"]: 1240 log.error(f"Unknown connexion format {connexion_format}") 1241 raise ValueError(f"Unknown connexion format {connexion_format}") 1242 else: 1243 return connexion_format 1244 1245 def insert_file_to_table( 1246 self, 1247 file, 1248 columns: str, 1249 header_len: int = 0, 1250 sep: str = "\t", 1251 chunksize: int = 1000000, 1252 ) -> None: 1253 """ 1254 The function reads a file in chunks and inserts each chunk into a table based on the specified 1255 database format. 1256 1257 :param file: The `file` parameter is the file that you want to load into a table. It should be 1258 the path to the file on your system 1259 :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that 1260 should contain the names of the columns in the table where the data will be inserted. The column 1261 names should be separated by commas within the string. For example, if you have columns named 1262 "id", "name 1263 :type columns: str 1264 :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies 1265 the number of lines to skip at the beginning of the file before reading the actual data. 
        This
        parameter allows you to skip any header information present in the file before processing the
        data, defaults to 0
        :type header_len: int (optional)
        :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the
        separator character that is used in the file being read. In this case, the default separator is
        set to `\t`, which represents a tab character. You can change this parameter to a different
        separator character if, defaults to \t
        :type sep: str (optional)
        :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time
        when processing the file in chunks. In the provided code snippet, the default value for
        `chunksize` is set to 1000000. This means that the file will be read in chunks of 1,, defaults
        to 1000000
        :type chunksize: int (optional)
        """

        # Config: the "load.chunk" config entry overrides the chunksize argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): a falsy chunksize (0/None) silently loads nothing —
        # confirm this is intended
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # The SQL below reads the pandas DataFrame through DuckDB's
                    # replacement scan, bound by the LOCAL VARIABLE NAME "chunk":
                    # renaming the loop variable would break this query.
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # SQLite: append the chunk through pandas' own loader
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)

    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
        table before loading the data and specify a sample size.

        :param input_file: The path to the input file.
This is the VCF file that will be loaded into the 1310 table 1311 :type input_file: str 1312 :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that 1313 determines whether the variants table should be dropped before loading the data. If set to 1314 `True`, the variants table will be dropped. If set to `False` (default), the variants table will 1315 not be dropped, defaults to False 1316 :type drop_variants_table: bool (optional) 1317 :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from 1318 the input file. If it is set to `None`, the default value of 20480 will be used, defaults to 1319 20480 1320 :type sample_size: int (optional) 1321 """ 1322 1323 log.info("Loading...") 1324 1325 # change input file 1326 if input_file: 1327 self.set_input(input_file) 1328 self.set_header() 1329 1330 # drop variants table 1331 if drop_variants_table: 1332 self.drop_variants_table() 1333 1334 # get table variants 1335 table_variants = self.get_table_variants() 1336 1337 # Access 1338 access = self.get_config().get("access", None) 1339 log.debug(f"access: {access}") 1340 1341 # Input format and compress 1342 input_format = self.get_input_format() 1343 input_compressed = self.get_input_compressed() 1344 log.debug(f"input_format: {input_format}") 1345 log.debug(f"input_compressed: {input_compressed}") 1346 1347 # input_compressed_format 1348 if input_compressed: 1349 input_compressed_format = "gzip" 1350 else: 1351 input_compressed_format = "none" 1352 log.debug(f"input_compressed_format: {input_compressed_format}") 1353 1354 # Connexion format 1355 connexion_format = self.get_connexion_format() 1356 1357 # Sample size 1358 if not sample_size: 1359 sample_size = -1 1360 log.debug(f"sample_size: {sample_size}") 1361 1362 # Load data 1363 log.debug(f"Load Data from {input_format}") 1364 1365 # DuckDB connexion 1366 if connexion_format in ["duckdb"]: 1367 1368 # Database already exists 1369 if self.input_format 
in ["db", "duckdb"]: 1370 1371 if connexion_format in ["duckdb"]: 1372 log.debug(f"Input file format '{self.input_format}' duckDB") 1373 else: 1374 log.error( 1375 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1376 ) 1377 raise ValueError( 1378 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1379 ) 1380 1381 # Load from existing database format 1382 else: 1383 1384 try: 1385 # Create Table or View 1386 database = Database(database=self.input) 1387 sql_from = database.get_sql_from(sample_size=sample_size) 1388 1389 if access in ["RO"]: 1390 sql_load = ( 1391 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1392 ) 1393 else: 1394 sql_load = ( 1395 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1396 ) 1397 self.conn.execute(sql_load) 1398 1399 except: 1400 # Format not available 1401 log.error(f"Input file format '{self.input_format}' not available") 1402 raise ValueError( 1403 f"Input file format '{self.input_format}' not available" 1404 ) 1405 1406 # SQLite connexion 1407 elif connexion_format in ["sqlite"] and input_format in [ 1408 "vcf", 1409 "tsv", 1410 "csv", 1411 "psv", 1412 ]: 1413 1414 # Main structure 1415 structure = { 1416 "#CHROM": "VARCHAR", 1417 "POS": "INTEGER", 1418 "ID": "VARCHAR", 1419 "REF": "VARCHAR", 1420 "ALT": "VARCHAR", 1421 "QUAL": "VARCHAR", 1422 "FILTER": "VARCHAR", 1423 "INFO": "VARCHAR", 1424 } 1425 1426 # Strcuture with samples 1427 structure_complete = structure 1428 if self.get_header_sample_list(): 1429 structure["FORMAT"] = "VARCHAR" 1430 for sample in self.get_header_sample_list(): 1431 structure_complete[sample] = "VARCHAR" 1432 1433 # Columns list for create and insert 1434 sql_create_table_columns = [] 1435 sql_create_table_columns_list = [] 1436 for column in structure_complete: 1437 column_type = structure_complete[column] 1438 sql_create_table_columns.append( 1439 f'"{column}" {column_type} 
default NULL' 1440 ) 1441 sql_create_table_columns_list.append(f'"{column}"') 1442 1443 # Create database 1444 log.debug(f"Create Table {table_variants}") 1445 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1446 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1447 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1448 self.conn.execute(sql_create_table) 1449 1450 # chunksize define length of file chunk load file 1451 chunksize = 100000 1452 1453 # delimiter 1454 delimiter = file_format_delimiters.get(input_format, "\t") 1455 1456 # Load the input file 1457 with open(self.input, "rt") as input_file: 1458 1459 # Use the appropriate file handler based on the input format 1460 if input_compressed: 1461 input_file = bgzf.open(self.input, "rt") 1462 if input_format in ["vcf"]: 1463 header_len = self.get_header_length() 1464 else: 1465 header_len = 0 1466 1467 # Insert the file contents into a table 1468 self.insert_file_to_table( 1469 input_file, 1470 columns=sql_create_table_columns_list_sql, 1471 header_len=header_len, 1472 sep=delimiter, 1473 chunksize=chunksize, 1474 ) 1475 1476 else: 1477 log.error( 1478 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1479 ) 1480 raise ValueError( 1481 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1482 ) 1483 1484 # Explode INFOS fields into table fields 1485 if self.get_explode_infos(): 1486 self.explode_infos( 1487 prefix=self.get_explode_infos_prefix(), 1488 fields=self.get_explode_infos_fields(), 1489 force=True, 1490 ) 1491 1492 # Create index after insertion 1493 self.create_indexes() 1494 1495 def get_explode_infos(self) -> bool: 1496 """ 1497 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1498 to False if it is not set. 
        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
        value. If the parameter is not present, it will return False.
        """

        return self.get_param().get("explode", {}).get("explode_infos", False)

    def get_explode_infos_fields(
        self,
        explode_infos_fields: str = None,
        remove_fields_not_in_header: bool = False,
    ) -> list:
        """
        The `get_explode_infos_fields` function returns a list of exploded information fields based on
        the input parameter `explode_infos_fields`.

        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
        comma-separated list of field names to explode
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
        flag that determines whether to remove fields that are not present in the header. If it is set
        to `True`, any field that is not in the header will be excluded from the list of exploded
        information fields. If it is set to `, defaults to False
        :type remove_fields_not_in_header: bool (optional)
        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
        Otherwise, it returns a list of exploded information fields after removing any spaces and
        splitting the string by commas.
        """

        # If no fields, get it in param
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If no fields, defined as all fields in header using keyword
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Input fields list: accept a comma-separated string or a list
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without * keyword
            # NOTE(review): fields_without_all is computed but never used below
            # — apparently dead code; confirm before removing
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # Fields in header
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # format keyword * in regex (match every header field)
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all fields with pattern (each field is treated as a regex)
                r = re.compile(field)
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # Remove fields input from search: an exact header match wins,
                # otherwise drop explicitly-listed fields from the pattern hits
                if field in fields_search:
                    fields_search = [field]
                elif fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # If field is not in header (avoid not well formatted header)
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field, if not already exists, and if it is in header (if asked)
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []

    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
        """
        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
        not provided.

        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
        prefix to be used for exploding or expanding information
        :type explode_infos_prefix: str
        :return: the value of the variable `explode_infos_prefix`.
        """

        # Fallback to the "explode.explode_infos_prefix" parameter ("" when unset)
        if not explode_infos_prefix:
            explode_infos_prefix = (
                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
            )

        return explode_infos_prefix

    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
        doesn't already exist.

        :param table_name: The name of the table to which you want to add a column
        :param column_name: The parameter "column_name" is the name of the column that you want to add
        to the table
        :param column_type: The `column_type` parameter specifies the data type of the column that you
        want to add to the table. It should be a string that represents the desired data type, such as
        "INTEGER", "TEXT", "REAL", etc
        :param default_value: The `default_value` parameter is an optional parameter that specifies the
        default value for the newly added column.
If a default value is provided, it will be assigned to 1644 the column for any existing rows that do not have a value for that column 1645 :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column 1646 if it already exists in the table. If `drop` is set to `True`, the function will drop the 1647 existing column before adding the new column. If `drop` is set to `False` (default),, defaults 1648 to False 1649 :type drop: bool (optional) 1650 :return: a boolean value indicating whether the column was successfully added to the table. 1651 """ 1652 1653 # added 1654 added = False 1655 dropped = False 1656 1657 # Check if the column already exists in the table 1658 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1659 columns = self.get_query_to_df(query).columns.tolist() 1660 if column_name.upper() in [c.upper() for c in columns]: 1661 log.debug( 1662 f"The {column_name} column already exists in the {table_name} table" 1663 ) 1664 if drop: 1665 self.drop_column(table_name=table_name, column_name=column_name) 1666 dropped = True 1667 else: 1668 return None 1669 else: 1670 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1671 1672 # Add column in table 1673 add_column_query = ( 1674 f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """ 1675 ) 1676 if default_value is not None: 1677 add_column_query += f" DEFAULT {default_value}" 1678 self.execute_query(add_column_query) 1679 added = not dropped 1680 log.debug( 1681 f"The {column_name} column was successfully added to the {table_name} table" 1682 ) 1683 1684 if added: 1685 added_column = { 1686 "table_name": table_name, 1687 "column_name": column_name, 1688 "column_type": column_type, 1689 "default_value": default_value, 1690 } 1691 else: 1692 added_column = None 1693 1694 return added_column 1695 1696 def drop_column( 1697 self, column: dict = None, table_name: str = None, column_name: str = None 1698 ) -> bool: 1699 """ 1700 The 
`drop_column` function drops a specified column from a given table in a database and returns 1701 True if the column was successfully dropped, and False if the column does not exist in the 1702 table. 1703 1704 :param column: The `column` parameter is a dictionary that contains information about the column 1705 you want to drop. It has two keys: 1706 :type column: dict 1707 :param table_name: The `table_name` parameter is the name of the table from which you want to 1708 drop a column 1709 :type table_name: str 1710 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1711 from the table 1712 :type column_name: str 1713 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1714 and False if the column does not exist in the table. 1715 """ 1716 1717 # Find column infos 1718 if column: 1719 if isinstance(column, dict): 1720 table_name = column.get("table_name", None) 1721 column_name = column.get("column_name", None) 1722 elif isinstance(column, str): 1723 table_name = self.get_table_variants() 1724 column_name = column 1725 else: 1726 table_name = None 1727 column_name = None 1728 1729 if not table_name and not column_name: 1730 return False 1731 1732 # Removed 1733 removed = False 1734 1735 # Check if the column already exists in the table 1736 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1737 columns = self.get_query_to_df(query).columns.tolist() 1738 if column_name in columns: 1739 log.debug(f"The {column_name} column exists in the {table_name} table") 1740 else: 1741 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1742 return False 1743 1744 # Add column in table # ALTER TABLE integers DROP k 1745 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1746 self.execute_query(add_column_query) 1747 removed = True 1748 log.debug( 1749 f"The {column_name} column was successfully dropped to the {table_name} table" 1750 ) 1751 
1752 return removed 1753 1754 def explode_infos( 1755 self, 1756 prefix: str = None, 1757 create_index: bool = False, 1758 fields: list = None, 1759 force: bool = False, 1760 proccess_all_fields_together: bool = False, 1761 table: str = None, 1762 ) -> list: 1763 """ 1764 The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into 1765 individual columns, returning a list of added columns. 1766 1767 :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO 1768 fields. If the `prefix` is not provided or is set to `None`, the function will use the value of 1769 `self.get_explode_infos_prefix()` as the prefix 1770 :type prefix: str 1771 :param create_index: The `create_index` parameter is a boolean flag that specifies whether to 1772 create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to 1773 `False`, indexes will not be created. The default value is `False`, defaults to False 1774 :type create_index: bool (optional) 1775 :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields 1776 that you want to explode into individual columns. If this parameter is not provided, all INFO 1777 fields will be exploded. You can specify the INFO fields you want to explode by passing them as 1778 a list to the ` 1779 :type fields: list 1780 :param force: The `force` parameter in the `explode_infos` function is a boolean flag that 1781 determines whether to drop and recreate a column if it already exists in the table. If `force` 1782 is set to `True`, the column will be dropped and recreated. If `force` is set to `False, 1783 defaults to False 1784 :type force: bool (optional) 1785 :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean 1786 flag that determines whether to process all the INFO fields together or individually. If set to 1787 `True`, all the INFO fields will be processed together. 
If set to `False`, each INFO field will 1788 be processed individually. The default value is, defaults to False 1789 :type proccess_all_fields_together: bool (optional) 1790 :param table: The `table` parameter in the `explode_infos` function is used to specify the name 1791 of the table where the exploded INFO fields will be added as individual columns. If you provide 1792 a value for the `table` parameter, the function will use that table name. If the `table` 1793 parameter is 1794 :type table: str 1795 :return: The `explode_infos` function returns a list of added columns. 1796 """ 1797 1798 # drop indexes 1799 self.drop_indexes() 1800 1801 # connexion format 1802 connexion_format = self.get_connexion_format() 1803 1804 # Access 1805 access = self.get_config().get("access", None) 1806 1807 # Added columns 1808 added_columns = [] 1809 1810 if access not in ["RO"]: 1811 1812 # prefix 1813 if prefix in [None, True] or not isinstance(prefix, str): 1814 if self.get_explode_infos_prefix() not in [None, True]: 1815 prefix = self.get_explode_infos_prefix() 1816 else: 1817 prefix = "INFO/" 1818 1819 # table variants 1820 if table is not None: 1821 table_variants = table 1822 else: 1823 table_variants = self.get_table_variants(clause="select") 1824 1825 # extra infos 1826 try: 1827 extra_infos = self.get_extra_infos() 1828 except: 1829 extra_infos = [] 1830 1831 # Header infos 1832 header_infos = self.get_header().infos 1833 1834 log.debug( 1835 f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields" 1836 ) 1837 1838 sql_info_alter_table_array = [] 1839 1840 # Info fields to check 1841 fields_list = list(header_infos) 1842 if fields: 1843 fields_list += fields 1844 fields_list = set(fields_list) 1845 1846 # If no fields 1847 if not fields: 1848 fields = [] 1849 1850 # Translate fields if patterns 1851 fields = self.get_explode_infos_fields(explode_infos_fields=fields) 1852 1853 for info in fields: 1854 1855 info_id_sql = prefix + info 1856 1857 if ( 1858 info 
in fields_list 1859 or prefix + info in fields_list 1860 or info in extra_infos 1861 ): 1862 1863 log.debug(f"Explode INFO fields - ADD '{info}' annotations fields") 1864 1865 if info in header_infos: 1866 info_type = header_infos[info].type 1867 info_num = header_infos[info].num 1868 else: 1869 info_type = "String" 1870 info_num = 0 1871 1872 type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR") 1873 if info_num != 1: 1874 type_sql = "VARCHAR" 1875 1876 # Add field 1877 added_column = self.add_column( 1878 table_name=table_variants, 1879 column_name=info_id_sql, 1880 column_type=type_sql, 1881 default_value="null", 1882 drop=force, 1883 ) 1884 1885 if added_column: 1886 added_columns.append(added_column) 1887 1888 if added_column or force: 1889 1890 # add field to index 1891 self.index_additionnal_fields.append(info_id_sql) 1892 1893 # Update field array 1894 if connexion_format in ["duckdb"]: 1895 update_info_field = f""" 1896 "{info_id_sql}" = 1897 CASE 1898 WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL 1899 ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) 1900 END 1901 """ 1902 elif connexion_format in ["sqlite"]: 1903 update_info_field = f""" 1904 "{info_id_sql}" = 1905 CASE 1906 WHEN instr(INFO, '{info}=') = 0 THEN NULL 1907 WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1) 1908 ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1) 1909 END 1910 """ 1911 1912 sql_info_alter_table_array.append(update_info_field) 1913 1914 if sql_info_alter_table_array: 1915 1916 # By chromosomes 1917 try: 1918 chromosomes_list = list( 1919 self.get_query_to_df( 1920 f""" 
SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """ 1921 )["#CHROM"] 1922 ) 1923 except: 1924 chromosomes_list = [None] 1925 1926 for chrom in chromosomes_list: 1927 log.debug(f"Explode INFO fields - Chromosome {chrom}...") 1928 1929 # Where clause 1930 where_clause = "" 1931 if chrom and len(chromosomes_list) > 1: 1932 where_clause = f""" WHERE "#CHROM" = '{chrom}' """ 1933 1934 # Update table 1935 if proccess_all_fields_together: 1936 sql_info_alter_table_array_join = ", ".join( 1937 sql_info_alter_table_array 1938 ) 1939 if sql_info_alter_table_array_join: 1940 sql_info_alter_table = f""" 1941 UPDATE {table_variants} 1942 SET {sql_info_alter_table_array_join} 1943 {where_clause} 1944 """ 1945 log.debug( 1946 f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..." 1947 ) 1948 # log.debug(sql_info_alter_table) 1949 self.conn.execute(sql_info_alter_table) 1950 else: 1951 sql_info_alter_num = 0 1952 for sql_info_alter in sql_info_alter_table_array: 1953 sql_info_alter_num += 1 1954 sql_info_alter_table = f""" 1955 UPDATE {table_variants} 1956 SET {sql_info_alter} 1957 {where_clause} 1958 """ 1959 log.debug( 1960 f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..." 
1961 ) 1962 # log.debug(sql_info_alter_table) 1963 self.conn.execute(sql_info_alter_table) 1964 1965 # create indexes 1966 if create_index: 1967 self.create_indexes() 1968 1969 return added_columns 1970 1971 def create_indexes(self) -> None: 1972 """ 1973 Create indexes on the table after insertion 1974 """ 1975 1976 # Access 1977 access = self.get_config().get("access", None) 1978 1979 # get table variants 1980 table_variants = self.get_table_variants("FROM") 1981 1982 if self.get_indexing() and access not in ["RO"]: 1983 # Create index 1984 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")' 1985 self.conn.execute(sql_create_table_index) 1986 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")' 1987 self.conn.execute(sql_create_table_index) 1988 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")' 1989 self.conn.execute(sql_create_table_index) 1990 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 1991 self.conn.execute(sql_create_table_index) 1992 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 1993 self.conn.execute(sql_create_table_index) 1994 for field in self.index_additionnal_fields: 1995 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 1996 self.conn.execute(sql_create_table_index) 1997 1998 def drop_indexes(self) -> None: 1999 """ 2000 Create indexes on the table after insertion 2001 """ 2002 2003 # Access 2004 access = self.get_config().get("access", None) 2005 2006 # get table variants 2007 table_variants = self.get_table_variants("FROM") 2008 2009 # Get database format 2010 connexion_format = 
self.get_connexion_format() 2011 2012 if access not in ["RO"]: 2013 if connexion_format in ["duckdb"]: 2014 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 2015 elif connexion_format in ["sqlite"]: 2016 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 2017 2018 list_indexes = self.conn.execute(sql_list_indexes) 2019 index_names = [row[0] for row in list_indexes.fetchall()] 2020 for index in index_names: 2021 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 2022 self.conn.execute(sql_drop_table_index) 2023 2024 def read_vcf_header(self, f) -> list: 2025 """ 2026 It reads the header of a VCF file and returns a list of the header lines 2027 2028 :param f: the file object 2029 :return: The header lines of the VCF file. 2030 """ 2031 2032 header_list = [] 2033 for line in f: 2034 header_list.append(line) 2035 if line.startswith("#CHROM"): 2036 break 2037 return header_list 2038 2039 def read_vcf_header_file(self, file: str = None) -> list: 2040 """ 2041 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 2042 uncompressed files. 2043 2044 :param file: The `file` parameter is a string that represents the path to the VCF header file 2045 that you want to read. It is an optional parameter, so if you don't provide a value, it will 2046 default to `None` 2047 :type file: str 2048 :return: The function `read_vcf_header_file` returns a list. 2049 """ 2050 2051 if self.get_input_compressed(input_file=file): 2052 with bgzf.open(file, "rt") as f: 2053 return self.read_vcf_header(f=f) 2054 else: 2055 with open(file, "rt") as f: 2056 return self.read_vcf_header(f=f) 2057 2058 def execute_query(self, query: str): 2059 """ 2060 It takes a query as an argument, executes it, and returns the results 2061 2062 :param query: The query to be executed 2063 :return: The result of the query is being returned. 
2064 """ 2065 if query: 2066 return self.conn.execute(query) # .fetchall() 2067 else: 2068 return None 2069 2070 def export_output( 2071 self, 2072 output_file: str | None = None, 2073 output_header: str | None = None, 2074 export_header: bool = True, 2075 query: str | None = None, 2076 parquet_partitions: list | None = None, 2077 chunk_size: int | None = None, 2078 threads: int | None = None, 2079 sort: bool = False, 2080 index: bool = False, 2081 order_by: str | None = None, 2082 ) -> bool: 2083 """ 2084 The `export_output` function exports data from a VCF file to a specified output file in various 2085 formats, including VCF, CSV, TSV, PSV, and Parquet. 2086 2087 :param output_file: The `output_file` parameter is a string that specifies the name of the 2088 output file to be generated by the function. This is where the exported data will be saved 2089 :type output_file: str 2090 :param output_header: The `output_header` parameter is a string that specifies the name of the 2091 file where the header of the VCF file will be exported. If this parameter is not provided, the 2092 header will be exported to a file with the same name as the `output_file` parameter, but with 2093 the extension " 2094 :type output_header: str 2095 :param export_header: The `export_header` parameter is a boolean flag that determines whether 2096 the header of a VCF file should be exported to a separate file or not. If `export_header` is 2097 True, the header will be exported to a file. If `export_header` is False, the header will not 2098 be, defaults to True, if output format is not VCF 2099 :type export_header: bool (optional) 2100 :param query: The `query` parameter is an optional SQL query that can be used to filter and 2101 select specific data from the VCF file before exporting it. 
If provided, only the data that 2102 matches the query will be exported 2103 :type query: str 2104 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 2105 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 2106 organize data in a hierarchical directory structure based on the values of one or more columns. 2107 This can improve query performance when working with large datasets 2108 :type parquet_partitions: list 2109 :param chunk_size: The `chunk_size` parameter specifies the number of 2110 records in batch when exporting data in Parquet format. This parameter is used for 2111 partitioning the Parquet file into multiple files. 2112 :type chunk_size: int 2113 :param threads: The `threads` parameter is an optional parameter that specifies the number of 2114 threads to be used during the export process. It determines the level of parallelism and can 2115 improve the performance of the export operation. If not provided, the function will use the 2116 default number of threads 2117 :type threads: int 2118 :param sort: The `sort` parameter is a boolean flag that determines whether the output file 2119 should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the 2120 genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to 2121 False 2122 :type sort: bool (optional) 2123 :param index: The `index` parameter is a boolean flag that determines whether an index should be 2124 created on the output file. If `index` is True, an index will be created. If `index` is False, 2125 no index will be created. The default value is False, defaults to False 2126 :type index: bool (optional) 2127 :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for 2128 sorting the output file. This parameter is only applicable when exporting data in VCF format 2129 :type order_by: str 2130 :return: a boolean value. 
It checks if the output file exists and returns True if it does, or 2131 None if it doesn't. 2132 """ 2133 2134 # Log 2135 log.info("Exporting...") 2136 2137 # Full path 2138 output_file = full_path(output_file) 2139 output_header = full_path(output_header) 2140 2141 # Config 2142 config = self.get_config() 2143 2144 # Param 2145 param = self.get_param() 2146 2147 # Tmp files to remove 2148 tmp_to_remove = [] 2149 2150 # If no output, get it 2151 if not output_file: 2152 output_file = self.get_output() 2153 2154 # If not threads 2155 if not threads: 2156 threads = self.get_threads() 2157 2158 # Auto header name with extension 2159 if export_header or output_header: 2160 if not output_header: 2161 output_header = f"{output_file}.hdr" 2162 # Export header 2163 self.export_header(output_file=output_file) 2164 2165 # Switch off export header if VCF output 2166 output_file_type = get_file_format(output_file) 2167 if output_file_type in ["vcf"]: 2168 export_header = False 2169 tmp_to_remove.append(output_header) 2170 2171 # Chunk size 2172 if not chunk_size: 2173 chunk_size = config.get("chunk_size", None) 2174 2175 # Parquet partition 2176 if not parquet_partitions: 2177 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2178 if parquet_partitions and isinstance(parquet_partitions, str): 2179 parquet_partitions = parquet_partitions.split(",") 2180 2181 # Order by 2182 if not order_by: 2183 order_by = param.get("export", {}).get("order_by", "") 2184 2185 # Header in output 2186 header_in_output = param.get("export", {}).get("include_header", False) 2187 2188 # Database 2189 database_source = self.get_connexion() 2190 2191 # Connexion format 2192 connexion_format = self.get_connexion_format() 2193 2194 # Explode infos 2195 if self.get_explode_infos(): 2196 self.explode_infos( 2197 prefix=self.get_explode_infos_prefix(), 2198 fields=self.get_explode_infos_fields(), 2199 force=False, 2200 ) 2201 2202 # if connexion_format in ["sqlite"] or query: 
2203 if connexion_format in ["sqlite"]: 2204 2205 # Export in Parquet 2206 random_tmp = "".join( 2207 random.choice(string.ascii_lowercase) for i in range(10) 2208 ) 2209 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2210 tmp_to_remove.append(database_source) 2211 2212 # Table Variants 2213 table_variants = self.get_table_variants() 2214 2215 # Create export query 2216 sql_query_export_subquery = f""" 2217 SELECT * FROM {table_variants} 2218 """ 2219 2220 # Write source file 2221 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2222 2223 # Create database 2224 database = Database( 2225 database=database_source, 2226 table="variants", 2227 header_file=output_header, 2228 conn_config=self.get_connexion_config(), 2229 ) 2230 2231 # Existing colomns header 2232 existing_columns_header = database.get_header_columns_from_database() 2233 2234 # Sample list 2235 get_samples = self.get_samples() 2236 get_samples_check = self.get_samples_check() 2237 samples_force = get_samples is not None 2238 sample_list = self.get_header_sample_list( 2239 check=get_samples_check, samples=get_samples, samples_force=samples_force 2240 ) 2241 2242 # Export file 2243 database.export( 2244 output_database=output_file, 2245 output_header=output_header, 2246 existing_columns_header=existing_columns_header, 2247 parquet_partitions=parquet_partitions, 2248 chunk_size=chunk_size, 2249 threads=threads, 2250 sort=sort, 2251 index=index, 2252 header_in_output=header_in_output, 2253 order_by=order_by, 2254 query=query, 2255 export_header=export_header, 2256 sample_list=sample_list, 2257 ) 2258 2259 # Remove 2260 remove_if_exists(tmp_to_remove) 2261 2262 return (os.path.exists(output_file) or None) and ( 2263 os.path.exists(output_file) or None 2264 ) 2265 2266 def get_extra_infos(self, table: str = None) -> list: 2267 """ 2268 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2269 in the header. 
2270 2271 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2272 name of the table from which you want to retrieve the extra columns that are not present in the 2273 header. If the `table` parameter is not provided when calling the function, it will default to 2274 using the variants 2275 :type table: str 2276 :return: A list of columns that are in the specified table but not in the header of the table. 2277 """ 2278 2279 header_columns = [] 2280 2281 if not table: 2282 table = self.get_table_variants(clause="from") 2283 header_columns = self.get_header_columns() 2284 2285 # Check all columns in the database 2286 query = f""" SELECT * FROM {table} LIMIT 1 """ 2287 log.debug(f"query {query}") 2288 table_columns = self.get_query_to_df(query).columns.tolist() 2289 extra_columns = [] 2290 2291 # Construct extra infos (not in header) 2292 for column in table_columns: 2293 if column not in header_columns: 2294 extra_columns.append(column) 2295 2296 return extra_columns 2297 2298 def get_extra_infos_sql(self, table: str = None) -> str: 2299 """ 2300 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2301 by double quotes 2302 2303 :param table: The name of the table to get the extra infos from. If None, the default table is 2304 used 2305 :type table: str 2306 :return: A string of the extra infos 2307 """ 2308 2309 return ", ".join( 2310 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2311 ) 2312 2313 def export_header( 2314 self, 2315 header_name: str = None, 2316 output_file: str = None, 2317 output_file_ext: str = ".hdr", 2318 clean_header: bool = True, 2319 remove_chrom_line: bool = False, 2320 ) -> str: 2321 """ 2322 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2323 specified options, and writes it to a new file. 
2324 2325 :param header_name: The `header_name` parameter is the name of the header file to be created. If 2326 this parameter is not specified, the header will be written to the output file 2327 :type header_name: str 2328 :param output_file: The `output_file` parameter in the `export_header` function is used to 2329 specify the name of the output file where the header will be written. If this parameter is not 2330 provided, the header will be written to a temporary file 2331 :type output_file: str 2332 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2333 string that represents the extension of the output header file. By default, it is set to ".hdr" 2334 if not specified by the user. This extension will be appended to the `output_file` name to 2335 create the final, defaults to .hdr 2336 :type output_file_ext: str (optional) 2337 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2338 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2339 `True`, the function will clean the header by modifying certain lines based on a specific 2340 pattern. If `clean_header`, defaults to True 2341 :type clean_header: bool (optional) 2342 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2343 boolean flag that determines whether the #CHROM line should be removed from the header before 2344 writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `, 2345 defaults to False 2346 :type remove_chrom_line: bool (optional) 2347 :return: The function `export_header` returns the name of the temporary header file that is 2348 created. 
2349 """ 2350 2351 if not header_name and not output_file: 2352 output_file = self.get_output() 2353 2354 if self.get_header(): 2355 2356 # Get header object 2357 header_obj = self.get_header() 2358 2359 # Create database 2360 db_for_header = Database(database=self.get_input()) 2361 2362 # Get real columns in the file 2363 db_header_columns = db_for_header.get_columns() 2364 2365 with tempfile.TemporaryDirectory() as tmpdir: 2366 2367 # Write header file 2368 header_file_tmp = os.path.join(tmpdir, "header") 2369 f = open(header_file_tmp, "w") 2370 vcf.Writer(f, header_obj) 2371 f.close() 2372 2373 # Replace #CHROM line with rel columns 2374 header_list = db_for_header.read_header_file( 2375 header_file=header_file_tmp 2376 ) 2377 header_list[-1] = "\t".join(db_header_columns) 2378 2379 # Remove CHROM line 2380 if remove_chrom_line: 2381 header_list.pop() 2382 2383 # Clean header 2384 if clean_header: 2385 header_list_clean = [] 2386 for head in header_list: 2387 # Clean head for malformed header 2388 head_clean = head 2389 head_clean = re.subn( 2390 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2391 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2392 head_clean, 2393 2, 2394 )[0] 2395 # Write header 2396 header_list_clean.append(head_clean) 2397 header_list = header_list_clean 2398 2399 tmp_header_name = output_file + output_file_ext 2400 2401 f = open(tmp_header_name, "w") 2402 for line in header_list: 2403 f.write(line) 2404 f.close() 2405 2406 return tmp_header_name 2407 2408 def export_variant_vcf( 2409 self, 2410 vcf_file, 2411 remove_info: bool = False, 2412 add_samples: bool = True, 2413 list_samples: list = [], 2414 where_clause: str = "", 2415 index: bool = False, 2416 threads: int | None = None, 2417 ) -> bool | None: 2418 """ 2419 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2420 remove INFO field, add samples, and control compression and indexing. 
2421 2422 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2423 written to. It is the output file that will contain the filtered VCF data based on the specified 2424 parameters 2425 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2426 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2427 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2428 in, defaults to False 2429 :type remove_info: bool (optional) 2430 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2431 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2432 If set to False, the samples will be removed. The default value is True, defaults to True 2433 :type add_samples: bool (optional) 2434 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2435 in the output VCF file. By default, all samples will be included. If you provide a list of 2436 samples, only those samples will be included in the output file 2437 :type list_samples: list 2438 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2439 determines whether or not to create an index for the output VCF file. If `index` is set to 2440 `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False 2441 :type index: bool (optional) 2442 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2443 number of threads to use for exporting the VCF file. It determines how many parallel threads 2444 will be used during the export process. More threads can potentially speed up the export process 2445 by utilizing multiple cores of the processor. 
If 2446 :type threads: int | None 2447 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2448 method with various parameters including the output file, query, threads, sort flag, and index 2449 flag. The `export_output` method is responsible for exporting the VCF data based on the 2450 specified parameters and configurations provided in the `export_variant_vcf` function. 2451 """ 2452 2453 # Config 2454 config = self.get_config() 2455 2456 # Extract VCF 2457 log.debug("Export VCF...") 2458 2459 # Table variants 2460 table_variants = self.get_table_variants() 2461 2462 # Threads 2463 if not threads: 2464 threads = self.get_threads() 2465 2466 # Info fields 2467 if remove_info: 2468 if not isinstance(remove_info, str): 2469 remove_info = "." 2470 info_field = f"""'{remove_info}' as INFO""" 2471 else: 2472 info_field = "INFO" 2473 2474 # Samples fields 2475 if add_samples: 2476 if not list_samples: 2477 list_samples = self.get_header_sample_list() 2478 if list_samples: 2479 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2480 else: 2481 samples_fields = "" 2482 log.debug(f"samples_fields: {samples_fields}") 2483 else: 2484 samples_fields = "" 2485 2486 # Where clause 2487 if where_clause is None: 2488 where_clause = "" 2489 2490 # Variants 2491 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2492 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2493 log.debug(f"sql_query_select={sql_query_select}") 2494 2495 return self.export_output( 2496 output_file=vcf_file, 2497 output_header=None, 2498 export_header=True, 2499 query=sql_query_select, 2500 parquet_partitions=None, 2501 chunk_size=config.get("chunk_size", None), 2502 threads=threads, 2503 sort=True, 2504 index=index, 2505 order_by=None, 2506 ) 2507 2508 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2509 """ 2510 It takes a list of commands and runs 
them in parallel using the number of threads specified 2511 2512 :param commands: A list of commands to run 2513 :param threads: The number of threads to use, defaults to 1 (optional) 2514 """ 2515 2516 run_parallel_commands(commands, threads) 2517 2518 def get_threads(self, default: int = 1) -> int: 2519 """ 2520 This function returns the number of threads to use for a job, with a default value of 1 if not 2521 specified. 2522 2523 :param default: The `default` parameter in the `get_threads` method is used to specify the 2524 default number of threads to use if no specific value is provided. If no value is provided for 2525 the `threads` parameter in the configuration or input parameters, the `default` value will be 2526 used, defaults to 1 2527 :type default: int (optional) 2528 :return: the number of threads to use for the current job. 2529 """ 2530 2531 # Config 2532 config = self.get_config() 2533 2534 # Param 2535 param = self.get_param() 2536 2537 # Input threads 2538 input_thread = param.get("threads", config.get("threads", None)) 2539 2540 # Check threads 2541 if not input_thread: 2542 threads = default 2543 elif int(input_thread) <= 0: 2544 threads = os.cpu_count() 2545 else: 2546 threads = int(input_thread) 2547 return threads 2548 2549 def get_memory(self, default: str = None) -> str: 2550 """ 2551 This function retrieves the memory value from parameters or configuration with a default value 2552 if not found. 2553 2554 :param default: The `get_memory` function takes in a default value as a string parameter. This 2555 default value is used as a fallback in case the `memory` parameter is not provided in the 2556 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2557 the function 2558 :type default: str 2559 :return: The `get_memory` function returns a string value representing the memory parameter. If 2560 the `input_memory` is provided in the parameters, it will return that value. 
Otherwise, it will 2561 return the default value provided as an argument to the function. 2562 """ 2563 2564 # Config 2565 config = self.get_config() 2566 2567 # Param 2568 param = self.get_param() 2569 2570 # Input threads 2571 input_memory = param.get("memory", config.get("memory", None)) 2572 2573 # Check threads 2574 if input_memory: 2575 memory = input_memory 2576 else: 2577 memory = default 2578 2579 return memory 2580 2581 def update_from_vcf(self, vcf_file: str) -> None: 2582 """ 2583 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2584 2585 :param vcf_file: the path to the VCF file 2586 """ 2587 2588 connexion_format = self.get_connexion_format() 2589 2590 if connexion_format in ["duckdb"]: 2591 self.update_from_vcf_duckdb(vcf_file) 2592 elif connexion_format in ["sqlite"]: 2593 self.update_from_vcf_sqlite(vcf_file) 2594 2595 def update_from_vcf_duckdb(self, vcf_file: str) -> None: 2596 """ 2597 It takes a VCF file and updates the INFO column of the variants table in the database with the 2598 INFO column of the VCF file 2599 2600 :param vcf_file: the path to the VCF file 2601 """ 2602 2603 # varaints table 2604 table_variants = self.get_table_variants() 2605 2606 # Loading VCF into temporaire table 2607 skip = self.get_header_length(file=vcf_file) 2608 vcf_df = pd.read_csv( 2609 vcf_file, 2610 sep="\t", 2611 engine="c", 2612 skiprows=skip, 2613 header=0, 2614 low_memory=False, 2615 ) 2616 sql_query_update = f""" 2617 UPDATE {table_variants} as table_variants 2618 SET INFO = concat( 2619 CASE 2620 WHEN INFO NOT IN ('', '.') 2621 THEN INFO 2622 ELSE '' 2623 END, 2624 ( 2625 SELECT 2626 concat( 2627 CASE 2628 WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.') 2629 THEN ';' 2630 ELSE '' 2631 END 2632 , 2633 CASE 2634 WHEN table_parquet.INFO NOT IN ('','.') 2635 THEN table_parquet.INFO 2636 ELSE '' 2637 END 2638 ) 2639 FROM vcf_df as table_parquet 2640 WHERE CAST(table_parquet.\"#CHROM\" AS 
VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR) 2641 AND table_parquet.\"POS\" = table_variants.\"POS\" 2642 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 2643 AND table_parquet.\"REF\" = table_variants.\"REF\" 2644 AND table_parquet.INFO NOT IN ('','.') 2645 ) 2646 ) 2647 ; 2648 """ 2649 self.conn.execute(sql_query_update) 2650 2651 def update_from_vcf_sqlite(self, vcf_file: str) -> None: 2652 """ 2653 It creates a temporary table in the SQLite database, loads the VCF file into the temporary 2654 table, then updates the INFO column of the variants table with the INFO column of the temporary 2655 table 2656 2657 :param vcf_file: The path to the VCF file you want to update the database with 2658 """ 2659 2660 # Create a temporary table for the VCF 2661 table_vcf = "tmp_vcf" 2662 sql_create = ( 2663 f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0" 2664 ) 2665 self.conn.execute(sql_create) 2666 2667 # Loading VCF into temporaire table 2668 vcf_df = pd.read_csv( 2669 vcf_file, sep="\t", comment="#", header=None, low_memory=False 2670 ) 2671 vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] 2672 vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False) 2673 2674 # Update table 'variants' with VCF data 2675 # warning: CONCAT as || operator 2676 sql_query_update = f""" 2677 UPDATE variants as table_variants 2678 SET INFO = CASE 2679 WHEN INFO NOT IN ('', '.') 2680 THEN INFO 2681 ELSE '' 2682 END || 2683 ( 2684 SELECT 2685 CASE 2686 WHEN table_variants.INFO NOT IN ('','.') 2687 AND table_vcf.INFO NOT IN ('','.') 2688 THEN ';' 2689 ELSE '' 2690 END || 2691 CASE 2692 WHEN table_vcf.INFO NOT IN ('','.') 2693 THEN table_vcf.INFO 2694 ELSE '' 2695 END 2696 FROM {table_vcf} as table_vcf 2697 WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\" 2698 AND table_vcf.\"POS\" = table_variants.\"POS\" 2699 AND table_vcf.\"ALT\" = table_variants.\"ALT\" 2700 AND table_vcf.\"REF\" = table_variants.\"REF\" 2701 
) 2702 """ 2703 self.conn.execute(sql_query_update) 2704 2705 # Drop temporary table 2706 sql_drop = f"DROP TABLE {table_vcf}" 2707 self.conn.execute(sql_drop) 2708 2709 def drop_variants_table(self) -> None: 2710 """ 2711 > This function drops the variants table 2712 """ 2713 2714 table_variants = self.get_table_variants() 2715 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2716 self.conn.execute(sql_table_variants) 2717 2718 def set_variant_id( 2719 self, variant_id_column: str = "variant_id", force: bool = None 2720 ) -> str: 2721 """ 2722 It adds a column to the variants table called `variant_id` and populates it with a hash of the 2723 `#CHROM`, `POS`, `REF`, and `ALT` columns 2724 2725 :param variant_id_column: The name of the column to be created in the variants table, defaults 2726 to variant_id 2727 :type variant_id_column: str (optional) 2728 :param force: If True, the variant_id column will be created even if it already exists 2729 :type force: bool 2730 :return: The name of the column that contains the variant_id 2731 """ 2732 2733 # Assembly 2734 assembly = self.get_param().get( 2735 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 2736 ) 2737 2738 # INFO/Tag prefix 2739 prefix = self.get_explode_infos_prefix() 2740 2741 # Explode INFO/SVTYPE 2742 added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"]) 2743 2744 # variants table 2745 table_variants = self.get_table_variants() 2746 2747 # variant_id column 2748 if not variant_id_column: 2749 variant_id_column = "variant_id" 2750 2751 # Creta variant_id column 2752 if "variant_id" not in self.get_extra_infos() or force: 2753 2754 # Create column 2755 self.add_column( 2756 table_name=table_variants, 2757 column_name=variant_id_column, 2758 column_type="UBIGINT", 2759 default_value="0", 2760 ) 2761 2762 # Update column 2763 self.conn.execute( 2764 f""" 2765 UPDATE {table_variants} 2766 SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", 
'"{prefix}SVTYPE"') 2767 """ 2768 ) 2769 2770 # Remove added columns 2771 for added_column in added_columns: 2772 self.drop_column(column=added_column) 2773 2774 # return variant_id column name 2775 return variant_id_column 2776 2777 def get_variant_id_column( 2778 self, variant_id_column: str = "variant_id", force: bool = None 2779 ) -> str: 2780 """ 2781 This function returns the variant_id column name 2782 2783 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2784 defaults to variant_id 2785 :type variant_id_column: str (optional) 2786 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2787 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2788 if it is not already set, or if it is set 2789 :type force: bool 2790 :return: The variant_id column name. 2791 """ 2792 2793 return self.set_variant_id(variant_id_column=variant_id_column, force=force) 2794 2795 ### 2796 # Annotation 2797 ### 2798 2799 def scan_databases( 2800 self, 2801 database_formats: list = ["parquet"], 2802 database_releases: list = ["current"], 2803 ) -> dict: 2804 """ 2805 The function `scan_databases` scans for available databases based on specified formats and 2806 releases. 2807 2808 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2809 of the databases to be scanned. In this case, the accepted format is "parquet" 2810 :type database_formats: list ["parquet"] 2811 :param database_releases: The `database_releases` parameter is a list that specifies the 2812 releases of the databases to be scanned. 
In the provided function, the default value for 2813 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2814 databases that are in the "current" 2815 :type database_releases: list 2816 :return: The function `scan_databases` returns a dictionary containing information about 2817 databases that match the specified formats and releases. 2818 """ 2819 2820 # Config 2821 config = self.get_config() 2822 2823 # Param 2824 param = self.get_param() 2825 2826 # Param - Assembly 2827 assembly = param.get("assembly", config.get("assembly", None)) 2828 if not assembly: 2829 assembly = DEFAULT_ASSEMBLY 2830 log.warning(f"Default assembly '{assembly}'") 2831 2832 # Scan for availabled databases 2833 log.info( 2834 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2835 ) 2836 databases_infos_dict = databases_infos( 2837 database_folder_releases=database_releases, 2838 database_formats=database_formats, 2839 assembly=assembly, 2840 config=config, 2841 ) 2842 log.info( 2843 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2844 ) 2845 2846 return databases_infos_dict 2847 2848 def annotation(self) -> None: 2849 """ 2850 It annotates the VCF file with the annotations specified in the config file. 
"""

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (param overrides config, then fall back to default)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # annotations databases folders
        # Union of the generic annotations folders plus the parquet/bcftools
        # specific ones; used later to resolve relative database names
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations
        # When "annotations" is a comma-separated string, split it into a list
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Each tools param
        # Fold the per-tool shortcut parameters (annotation_parquet,
        # annotation_snpsift, ...) into the single "annotations" string,
        # prefixing each with its tool name where relevant.
        # NOTE(review): `!= None` comparisons below should idiomatically be
        # `is not None` (PEP 8) — behavior kept as-is here.
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            # "," is replaced by "+" so multiple files stay one single
            # "snpsift:" entry in the comma-separated annotations string
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools",
None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters
            # Normalize to a dict {annotation_file: fields}; a plain string
            # becomes entries with {"INFO": None} (i.e. "all INFO fields")
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f" {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode annotations if ALL
                # "ALL" (optionally "ALL:format=...:release=...") expands to
                # every database found by scan_databases
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases)
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for available databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each database
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS
                    else:

                        # Tools detection
                        # Explicit "bcftools:"/"snpsift:" prefixes pin the
                        # tool; otherwise it is inferred from the file format
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial =
"snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # list of files
                        # both "+" and ":" act as file separators here
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file
                                annotation_file_found = None

                                # Expand user
                                annotation_file = full_path(annotation_file)

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file

                                else:
                                    # Find within assembly folders
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    if not annotation_file_found and not assembly:
                                        # Find within folders
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                    log.debug(
                                        f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                    )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    # NOTE(review): hardcoded False, so the
                                    # bcftools auto-selection branch below is
                                    # currently dead code — confirm intent
                                    bcftools_preference = False

                                    # Check Annotation Tool
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and
quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        # NOTE(review): "tsv" is duplicated in
                                        # this list (harmless but redundant)
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            annotation_tool = "parquet"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch
                                    # Register the resolved file under
                                    # param["annotation"][tool]["annotations"]
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    log.error(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

        self.set_param(param)

        # Dispatch to the per-tool annotation methods, in a fixed order
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser",
None) is not None: 3206 log.info("Annotations 'exomiser'...") 3207 self.annotation_exomiser() 3208 if param.get("annotation", {}).get("splice", None) is not None: 3209 log.info("Annotations 'splice' ...") 3210 self.annotation_splice() 3211 3212 # Explode INFOS fields into table fields 3213 if self.get_explode_infos(): 3214 self.explode_infos( 3215 prefix=self.get_explode_infos_prefix(), 3216 fields=self.get_explode_infos_fields(), 3217 force=True, 3218 ) 3219 3220 def annotation_snpsift(self, threads: int = None) -> None: 3221 """ 3222 This function annotate with bcftools 3223 3224 :param threads: Number of threads to use 3225 :return: the value of the variable "return_value". 3226 """ 3227 3228 # DEBUG 3229 log.debug("Start annotation with bcftools databases") 3230 3231 # Threads 3232 if not threads: 3233 threads = self.get_threads() 3234 log.debug("Threads: " + str(threads)) 3235 3236 # Config 3237 config = self.get_config() 3238 log.debug("Config: " + str(config)) 3239 3240 # Config - snpSift 3241 snpsift_bin_command = get_bin_command( 3242 bin="SnpSift.jar", 3243 tool="snpsift", 3244 bin_type="jar", 3245 config=config, 3246 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3247 ) 3248 if not snpsift_bin_command: 3249 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3250 log.error(msg_err) 3251 raise ValueError(msg_err) 3252 3253 # Config - bcftools 3254 bcftools_bin_command = get_bin_command( 3255 bin="bcftools", 3256 tool="bcftools", 3257 bin_type="bin", 3258 config=config, 3259 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3260 ) 3261 if not bcftools_bin_command: 3262 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3263 log.error(msg_err) 3264 raise ValueError(msg_err) 3265 3266 # Config - BCFTools databases folders 3267 databases_folders = set( 3268 self.get_config() 3269 .get("folders", {}) 3270 .get("databases", {}) 3271 .get("annotations", ["."]) 3272 + self.get_config() 3273 .get("folders", {}) 3274 
.get("databases", {}) 3275 .get("bcftools", ["."]) 3276 ) 3277 log.debug("Databases annotations: " + str(databases_folders)) 3278 3279 # Param 3280 annotations = ( 3281 self.get_param() 3282 .get("annotation", {}) 3283 .get("snpsift", {}) 3284 .get("annotations", None) 3285 ) 3286 log.debug("Annotations: " + str(annotations)) 3287 3288 # Assembly 3289 assembly = self.get_param().get( 3290 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3291 ) 3292 3293 # Data 3294 table_variants = self.get_table_variants() 3295 3296 # Check if not empty 3297 log.debug("Check if not empty") 3298 sql_query_chromosomes = ( 3299 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3300 ) 3301 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3302 if not sql_query_chromosomes_df["count"][0]: 3303 log.info(f"VCF empty") 3304 return 3305 3306 # VCF header 3307 vcf_reader = self.get_header() 3308 log.debug("Initial header: " + str(vcf_reader.infos)) 3309 3310 # Existing annotations 3311 for vcf_annotation in self.get_header().infos: 3312 3313 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3314 log.debug( 3315 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3316 ) 3317 3318 if annotations: 3319 3320 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3321 3322 # Export VCF file 3323 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3324 3325 # Init 3326 commands = {} 3327 3328 for annotation in annotations: 3329 annotation_fields = annotations[annotation] 3330 3331 # Annotation Name 3332 annotation_name = os.path.basename(annotation) 3333 3334 if not annotation_fields: 3335 annotation_fields = {"INFO": None} 3336 3337 log.debug(f"Annotation '{annotation_name}'") 3338 log.debug( 3339 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3340 ) 3341 3342 # Create Database 3343 database = Database( 3344 database=annotation, 3345 databases_folders=databases_folders, 3346 
assembly=assembly, 3347 ) 3348 3349 # Find files 3350 db_file = database.get_database() 3351 db_file = full_path(db_file) 3352 db_hdr_file = database.get_header_file() 3353 db_hdr_file = full_path(db_hdr_file) 3354 db_file_type = database.get_format() 3355 db_tbi_file = f"{db_file}.tbi" 3356 db_file_compressed = database.is_compressed() 3357 3358 # Check if compressed 3359 if not db_file_compressed: 3360 log.error( 3361 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3362 ) 3363 raise ValueError( 3364 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3365 ) 3366 3367 # Check if indexed 3368 if not os.path.exists(db_tbi_file): 3369 log.error( 3370 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3371 ) 3372 raise ValueError( 3373 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3374 ) 3375 3376 # Check index - try to create if not exists 3377 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3378 log.error("Annotation failed: database not valid") 3379 log.error(f"Annotation annotation file: {db_file}") 3380 log.error(f"Annotation annotation header: {db_hdr_file}") 3381 log.error(f"Annotation annotation index: {db_tbi_file}") 3382 raise ValueError( 3383 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3384 ) 3385 else: 3386 3387 log.debug( 3388 f"Annotation '{annotation}' - file: " 3389 + str(db_file) 3390 + " and " 3391 + str(db_hdr_file) 3392 ) 3393 3394 # Load header as VCF object 3395 db_hdr_vcf = Variants(input=db_hdr_file) 3396 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3397 log.debug( 3398 "Annotation database header: " 3399 + str(db_hdr_vcf_header_infos) 3400 ) 3401 3402 # For all fields in database 3403 annotation_fields_full = False 3404 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3405 annotation_fields = { 3406 key: key for key in 
db_hdr_vcf_header_infos 3407 } 3408 log.debug( 3409 "Annotation database header - All annotations added: " 3410 + str(annotation_fields) 3411 ) 3412 annotation_fields_full = True 3413 3414 # # Create file for field rename 3415 # log.debug("Create file for field rename") 3416 # tmp_rename = NamedTemporaryFile( 3417 # prefix=self.get_prefix(), 3418 # dir=self.get_tmp_dir(), 3419 # suffix=".rename", 3420 # delete=False, 3421 # ) 3422 # tmp_rename_name = tmp_rename.name 3423 # tmp_files.append(tmp_rename_name) 3424 3425 # Number of fields 3426 nb_annotation_field = 0 3427 annotation_list = [] 3428 annotation_infos_rename_list = [] 3429 3430 for annotation_field in annotation_fields: 3431 3432 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3433 annotation_fields_new_name = annotation_fields.get( 3434 annotation_field, annotation_field 3435 ) 3436 if not annotation_fields_new_name: 3437 annotation_fields_new_name = annotation_field 3438 3439 # Check if field is in DB and if field is not elready in input data 3440 if ( 3441 annotation_field in db_hdr_vcf.get_header().infos 3442 and annotation_fields_new_name 3443 not in self.get_header().infos 3444 ): 3445 3446 log.info( 3447 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3448 ) 3449 3450 # BCFTools annotate param to rename fields 3451 if annotation_field != annotation_fields_new_name: 3452 annotation_infos_rename_list.append( 3453 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3454 ) 3455 3456 # Add INFO field to header 3457 db_hdr_vcf_header_infos_number = ( 3458 db_hdr_vcf_header_infos[annotation_field].num or "." 
3459 ) 3460 db_hdr_vcf_header_infos_type = ( 3461 db_hdr_vcf_header_infos[annotation_field].type 3462 or "String" 3463 ) 3464 db_hdr_vcf_header_infos_description = ( 3465 db_hdr_vcf_header_infos[annotation_field].desc 3466 or f"{annotation_field} description" 3467 ) 3468 db_hdr_vcf_header_infos_source = ( 3469 db_hdr_vcf_header_infos[annotation_field].source 3470 or "unknown" 3471 ) 3472 db_hdr_vcf_header_infos_version = ( 3473 db_hdr_vcf_header_infos[annotation_field].version 3474 or "unknown" 3475 ) 3476 3477 vcf_reader.infos[annotation_fields_new_name] = ( 3478 vcf.parser._Info( 3479 annotation_fields_new_name, 3480 db_hdr_vcf_header_infos_number, 3481 db_hdr_vcf_header_infos_type, 3482 db_hdr_vcf_header_infos_description, 3483 db_hdr_vcf_header_infos_source, 3484 db_hdr_vcf_header_infos_version, 3485 self.code_type_map[ 3486 db_hdr_vcf_header_infos_type 3487 ], 3488 ) 3489 ) 3490 3491 annotation_list.append(annotation_field) 3492 3493 nb_annotation_field += 1 3494 3495 else: 3496 3497 if ( 3498 annotation_field 3499 not in db_hdr_vcf.get_header().infos 3500 ): 3501 log.warning( 3502 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3503 ) 3504 if ( 3505 annotation_fields_new_name 3506 in self.get_header().infos 3507 ): 3508 log.warning( 3509 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3510 ) 3511 3512 log.info( 3513 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3514 ) 3515 3516 annotation_infos = ",".join(annotation_list) 3517 3518 if annotation_infos != "": 3519 3520 # Annotated VCF (and error file) 3521 tmp_annotation_vcf_name = os.path.join( 3522 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3523 ) 3524 tmp_annotation_vcf_name_err = ( 3525 tmp_annotation_vcf_name + ".err" 3526 ) 3527 3528 # Add fields to annotate 3529 if not annotation_fields_full: 3530 annotation_infos_option = f"-info {annotation_infos}" 3531 else: 
3532 annotation_infos_option = "" 3533 3534 # Info fields rename 3535 if annotation_infos_rename_list: 3536 annotation_infos_rename = " -c " + ",".join( 3537 annotation_infos_rename_list 3538 ) 3539 else: 3540 annotation_infos_rename = "" 3541 3542 # Annotate command 3543 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3544 3545 # Add command 3546 commands[command_annotate] = tmp_annotation_vcf_name 3547 3548 if commands: 3549 3550 # Export VCF file 3551 self.export_variant_vcf( 3552 vcf_file=tmp_vcf_name, 3553 remove_info=True, 3554 add_samples=False, 3555 index=True, 3556 ) 3557 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3558 3559 # Num command 3560 nb_command = 0 3561 3562 # Annotate 3563 for command_annotate in commands: 3564 nb_command += 1 3565 log.info( 3566 f"Annotation - Annotate [{nb_command}/{len(commands)}]..." 3567 ) 3568 log.debug(f"command_annotate={command_annotate}") 3569 run_parallel_commands([command_annotate], threads) 3570 3571 # Debug 3572 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3573 3574 # Update variants 3575 log.info( 3576 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3577 ) 3578 self.update_from_vcf(commands[command_annotate]) 3579 3580 def annotation_bcftools(self, threads: int = None) -> None: 3581 """ 3582 This function annotate with bcftools 3583 3584 :param threads: Number of threads to use 3585 :return: the value of the variable "return_value". 
"""

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # DEBUG
        # Keep temporary files around when verbosity is "debug"
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            tmp_ann_vcf_list = []
            commands = []
            tmp_files = []
            err_files = []

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed (bcftools annotate requires bgzip)
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check index - try to create if not exists
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load header as VCF object
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # For all fields in database
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Check if field is in DB and if field is not elready in input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # annotation_list.append(annotation_field)
                            # bcftools "-c" syntax: "NEW:=INFO/OLD" renames a field
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools (remove "#CHROM" and variants line)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command
                        # NOTE(review): shell commands are built by string
                        # interpolation throughout this method — paths are
                        # internal/temp files, but confirm they cannot carry
                        # shell metacharacters
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chomosomes
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        # NOTE(review): "chomosomes_list" typo kept as-is
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED columns in the annotation file
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        # One bcftools command per chromosome, restricted to a
                        # BED of merged +/- 1Mb windows around the variants
                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detecte regions
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT \"#CHROM\",
                                CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export VCF file
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # calculate threads for annotated commands
                # NOTE(review): this inner `if commands:` is redundant — we
                # are already inside `if commands:`
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                # round() may yield 0 when there are more commands than threads
                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp file remove command
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge (merges per-chromosome annotated VCFs back
                    # with the original, then cleans up the temp files)
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Error messages
                    # Scan the stderr capture files; "[W::" lines are
                    # warnings, "[E::" lines are fatal errors
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # log info
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f" {message}")
                    # debug info
                    for message in list(set(error_message_command_all)):
                        log.debug(f" {message}")
                    # failed
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Update variants
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)

    def annotation_exomiser(self, threads: int = None) -> None:
        """
        This function annotate with Exomiser

        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:

        - "analysis" (dict/file):
            Full analysis dictionnary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
            Default : None

        - "preset" (string):
            Analysis preset (available in config folder).
4073 Used if no full "analysis" is provided. 4074 Default: "exome" 4075 - "phenopacket" (dict/file): 4076 Samples and phenotipic features parameters (see Exomiser docs). 4077 Either a dict, or a file in JSON or YAML format. 4078 Default: None 4079 - "subject" (dict): 4080 Sample parameters (see Exomiser docs). 4081 Example: 4082 "subject": 4083 { 4084 "id": "ISDBM322017", 4085 "sex": "FEMALE" 4086 } 4087 Default: None 4088 - "sample" (string): 4089 Sample name to construct "subject" section: 4090 "subject": 4091 { 4092 "id": "<sample>", 4093 "sex": "UNKNOWN_SEX" 4094 } 4095 Default: None 4096 - "phenotypicFeatures" (dict) 4097 Phenotypic features to construct "subject" section. 4098 Example: 4099 "phenotypicFeatures": 4100 [ 4101 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 4102 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 4103 ] 4104 - "hpo" (list) 4105 List of HPO ids as phenotypic features. 4106 Example: 4107 "hpo": ['0001156', '0001363', '0011304', '0010055'] 4108 Default: [] 4109 - "outputOptions" (dict): 4110 Output options (see Exomiser docs). 4111 Default: 4112 "output_options" = 4113 { 4114 "outputContributingVariantsOnly": False, 4115 "numGenes": 0, 4116 "outputFormats": ["TSV_VARIANT", "VCF"] 4117 } 4118 - "transcript_source" (string): 4119 Transcript source (either "refseq", "ucsc", "ensembl") 4120 Default: "refseq" 4121 - "exomiser_to_info" (boolean): 4122 Add exomiser TSV file columns as INFO fields in VCF. 4123 Default: False 4124 - "release" (string): 4125 Exomise database release. 4126 If not exists, database release will be downloaded (take a while). 4127 Default: None (provided by application.properties configuration file) 4128 - "exomiser_application_properties" (file): 4129 Exomiser configuration file (see Exomiser docs). 4130 Useful to automatically download databases (especially for specific genome databases). 
4131 4132 Notes: 4133 - If no sample in parameters, first sample in VCF will be chosen 4134 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 4135 4136 :param threads: The number of threads to use 4137 :return: None. 4138 """ 4139 4140 # DEBUG 4141 log.debug("Start annotation with Exomiser databases") 4142 4143 # Threads 4144 if not threads: 4145 threads = self.get_threads() 4146 log.debug("Threads: " + str(threads)) 4147 4148 # Config 4149 config = self.get_config() 4150 log.debug("Config: " + str(config)) 4151 4152 # Config - Folders - Databases 4153 databases_folders = ( 4154 config.get("folders", {}) 4155 .get("databases", {}) 4156 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 4157 ) 4158 databases_folders = full_path(databases_folders) 4159 if not os.path.exists(databases_folders): 4160 log.error(f"Databases annotations: {databases_folders} NOT found") 4161 log.debug("Databases annotations: " + str(databases_folders)) 4162 4163 # Config - Exomiser 4164 exomiser_bin_command = get_bin_command( 4165 bin="exomiser-cli*.jar", 4166 tool="exomiser", 4167 bin_type="jar", 4168 config=config, 4169 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 4170 ) 4171 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 4172 if not exomiser_bin_command: 4173 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 4174 log.error(msg_err) 4175 raise ValueError(msg_err) 4176 4177 # Param 4178 param = self.get_param() 4179 log.debug("Param: " + str(param)) 4180 4181 # Param - Exomiser 4182 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 4183 log.debug(f"Param Exomiser: {param_exomiser}") 4184 4185 # Param - Assembly 4186 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4187 log.debug("Assembly: " + str(assembly)) 4188 4189 # Data 4190 table_variants = self.get_table_variants() 4191 4192 # Check if not empty 4193 log.debug("Check if not empty") 4194 sql_query_chromosomes = 
( 4195 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4196 ) 4197 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4198 log.info(f"VCF empty") 4199 return False 4200 4201 # VCF header 4202 vcf_reader = self.get_header() 4203 log.debug("Initial header: " + str(vcf_reader.infos)) 4204 4205 # Samples 4206 samples = self.get_header_sample_list() 4207 if not samples: 4208 log.error("No Samples in VCF") 4209 return False 4210 log.debug(f"Samples: {samples}") 4211 4212 # Memory limit 4213 memory_limit = self.get_memory("8G") 4214 log.debug(f"memory_limit: {memory_limit}") 4215 4216 # Exomiser java options 4217 exomiser_java_options = ( 4218 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4219 ) 4220 log.debug(f"Exomiser java options: {exomiser_java_options}") 4221 4222 # Download Exomiser (if not exists) 4223 exomiser_release = param_exomiser.get("release", None) 4224 exomiser_application_properties = param_exomiser.get( 4225 "exomiser_application_properties", None 4226 ) 4227 databases_download_exomiser( 4228 assemblies=[assembly], 4229 exomiser_folder=databases_folders, 4230 exomiser_release=exomiser_release, 4231 exomiser_phenotype_release=exomiser_release, 4232 exomiser_application_properties=exomiser_application_properties, 4233 ) 4234 4235 # Force annotation 4236 force_update_annotation = True 4237 4238 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4239 log.debug("Start annotation Exomiser") 4240 4241 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4242 4243 # tmp_dir = "/tmp/exomiser" 4244 4245 ### ANALYSIS ### 4246 ################ 4247 4248 # Create analysis.json through analysis dict 4249 # either analysis in param or by default 4250 # depending on preset exome/genome) 4251 4252 # Init analysis dict 4253 param_exomiser_analysis_dict = {} 4254 4255 # analysis from param 4256 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4257 
param_exomiser_analysis = full_path(param_exomiser_analysis) 4258 4259 # If analysis in param -> load anlaysis json 4260 if param_exomiser_analysis: 4261 4262 # If param analysis is a file and exists 4263 if isinstance(param_exomiser_analysis, str) and os.path.exists( 4264 param_exomiser_analysis 4265 ): 4266 # Load analysis file into analysis dict (either yaml or json) 4267 with open(param_exomiser_analysis) as json_file: 4268 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4269 4270 # If param analysis is a dict 4271 elif isinstance(param_exomiser_analysis, dict): 4272 # Load analysis dict into analysis dict (either yaml or json) 4273 param_exomiser_analysis_dict = param_exomiser_analysis 4274 4275 # Error analysis type 4276 else: 4277 log.error(f"Analysis type unknown. Check param file.") 4278 raise ValueError(f"Analysis type unknown. Check param file.") 4279 4280 # Case no input analysis config file/dict 4281 # Use preset (exome/genome) to open default config file 4282 if not param_exomiser_analysis_dict: 4283 4284 # default preset 4285 default_preset = "exome" 4286 4287 # Get param preset or default preset 4288 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4289 4290 # Try to find if preset is a file 4291 if os.path.exists(param_exomiser_preset): 4292 # Preset file is provided in full path 4293 param_exomiser_analysis_default_config_file = ( 4294 param_exomiser_preset 4295 ) 4296 # elif os.path.exists(full_path(param_exomiser_preset)): 4297 # # Preset file is provided in full path 4298 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4299 elif os.path.exists( 4300 os.path.join(folder_config, param_exomiser_preset) 4301 ): 4302 # Preset file is provided a basename in config folder (can be a path with subfolders) 4303 param_exomiser_analysis_default_config_file = os.path.join( 4304 folder_config, param_exomiser_preset 4305 ) 4306 else: 4307 # Construct preset file 4308 
param_exomiser_analysis_default_config_file = os.path.join( 4309 folder_config, 4310 f"preset-{param_exomiser_preset}-analysis.json", 4311 ) 4312 4313 # If preset file exists 4314 param_exomiser_analysis_default_config_file = full_path( 4315 param_exomiser_analysis_default_config_file 4316 ) 4317 if os.path.exists(param_exomiser_analysis_default_config_file): 4318 # Load prest file into analysis dict (either yaml or json) 4319 with open( 4320 param_exomiser_analysis_default_config_file 4321 ) as json_file: 4322 # param_exomiser_analysis_dict[""] = json.load(json_file) 4323 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4324 json_file 4325 ) 4326 4327 # Error preset file 4328 else: 4329 log.error( 4330 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4331 ) 4332 raise ValueError( 4333 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4334 ) 4335 4336 # If no analysis dict created 4337 if not param_exomiser_analysis_dict: 4338 log.error(f"No analysis config") 4339 raise ValueError(f"No analysis config") 4340 4341 # Log 4342 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4343 4344 ### PHENOPACKET ### 4345 ################### 4346 4347 # If no PhenoPacket in analysis dict -> check in param 4348 if "phenopacket" not in param_exomiser_analysis_dict: 4349 4350 # If PhenoPacket in param -> load anlaysis json 4351 if param_exomiser.get("phenopacket", None): 4352 4353 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4354 param_exomiser_phenopacket = full_path( 4355 param_exomiser_phenopacket 4356 ) 4357 4358 # If param phenopacket is a file and exists 4359 if isinstance( 4360 param_exomiser_phenopacket, str 4361 ) and os.path.exists(param_exomiser_phenopacket): 4362 # Load phenopacket file into analysis dict (either yaml or json) 4363 with open(param_exomiser_phenopacket) as json_file: 4364 param_exomiser_analysis_dict["phenopacket"] = ( 4365 yaml.safe_load(json_file) 
4366 ) 4367 4368 # If param phenopacket is a dict 4369 elif isinstance(param_exomiser_phenopacket, dict): 4370 # Load phenopacket dict into analysis dict (either yaml or json) 4371 param_exomiser_analysis_dict["phenopacket"] = ( 4372 param_exomiser_phenopacket 4373 ) 4374 4375 # Error phenopacket type 4376 else: 4377 log.error(f"Phenopacket type unknown. Check param file.") 4378 raise ValueError( 4379 f"Phenopacket type unknown. Check param file." 4380 ) 4381 4382 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4383 if "phenopacket" not in param_exomiser_analysis_dict: 4384 4385 # Init PhenoPacket 4386 param_exomiser_analysis_dict["phenopacket"] = { 4387 "id": "analysis", 4388 "proband": {}, 4389 } 4390 4391 ### Add subject ### 4392 4393 # If subject exists 4394 param_exomiser_subject = param_exomiser.get("subject", {}) 4395 4396 # If subject not exists -> found sample ID 4397 if not param_exomiser_subject: 4398 4399 # Found sample ID in param 4400 sample = param_exomiser.get("sample", None) 4401 4402 # Find sample ID (first sample) 4403 if not sample: 4404 sample_list = self.get_header_sample_list() 4405 if len(sample_list) > 0: 4406 sample = sample_list[0] 4407 else: 4408 log.error(f"No sample found") 4409 raise ValueError(f"No sample found") 4410 4411 # Create subject 4412 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4413 4414 # Add to dict 4415 param_exomiser_analysis_dict["phenopacket"][ 4416 "subject" 4417 ] = param_exomiser_subject 4418 4419 ### Add "phenotypicFeatures" ### 4420 4421 # If phenotypicFeatures exists 4422 param_exomiser_phenotypicfeatures = param_exomiser.get( 4423 "phenotypicFeatures", [] 4424 ) 4425 4426 # If phenotypicFeatures not exists -> Try to infer from hpo list 4427 if not param_exomiser_phenotypicfeatures: 4428 4429 # Found HPO in param 4430 param_exomiser_hpo = param_exomiser.get("hpo", []) 4431 4432 # Split HPO if list in string format separated by comma 4433 if 
isinstance(param_exomiser_hpo, str): 4434 param_exomiser_hpo = param_exomiser_hpo.split(",") 4435 4436 # Create HPO list 4437 for hpo in param_exomiser_hpo: 4438 hpo_clean = re.sub("[^0-9]", "", hpo) 4439 param_exomiser_phenotypicfeatures.append( 4440 { 4441 "type": { 4442 "id": f"HP:{hpo_clean}", 4443 "label": f"HP:{hpo_clean}", 4444 } 4445 } 4446 ) 4447 4448 # Add to dict 4449 param_exomiser_analysis_dict["phenopacket"][ 4450 "phenotypicFeatures" 4451 ] = param_exomiser_phenotypicfeatures 4452 4453 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4454 if not param_exomiser_phenotypicfeatures: 4455 for step in param_exomiser_analysis_dict.get( 4456 "analysis", {} 4457 ).get("steps", []): 4458 if "hiPhivePrioritiser" in step: 4459 param_exomiser_analysis_dict.get("analysis", {}).get( 4460 "steps", [] 4461 ).remove(step) 4462 4463 ### Add Input File ### 4464 4465 # Initial file name and htsFiles 4466 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4467 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4468 { 4469 "uri": tmp_vcf_name, 4470 "htsFormat": "VCF", 4471 "genomeAssembly": assembly, 4472 } 4473 ] 4474 4475 ### Add metaData ### 4476 4477 # If metaData not in analysis dict 4478 if "metaData" not in param_exomiser_analysis_dict: 4479 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4480 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4481 "createdBy": "howard", 4482 "phenopacketSchemaVersion": 1, 4483 } 4484 4485 ### OutputOptions ### 4486 4487 # Init output result folder 4488 output_results = os.path.join(tmp_dir, "results") 4489 4490 # If no outputOptions in analysis dict 4491 if "outputOptions" not in param_exomiser_analysis_dict: 4492 4493 # default output formats 4494 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4495 4496 # Get outputOptions in param 4497 output_options = param_exomiser.get("outputOptions", None) 4498 4499 # If no output_options in param -> check 4500 if not output_options: 
4501 output_options = { 4502 "outputContributingVariantsOnly": False, 4503 "numGenes": 0, 4504 "outputFormats": defaut_output_formats, 4505 } 4506 4507 # Replace outputDirectory in output options 4508 output_options["outputDirectory"] = output_results 4509 output_options["outputFileName"] = "howard" 4510 4511 # Add outputOptions in analysis dict 4512 param_exomiser_analysis_dict["outputOptions"] = output_options 4513 4514 else: 4515 4516 # Replace output_results and output format (if exists in param) 4517 param_exomiser_analysis_dict["outputOptions"][ 4518 "outputDirectory" 4519 ] = output_results 4520 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4521 list( 4522 set( 4523 param_exomiser_analysis_dict.get( 4524 "outputOptions", {} 4525 ).get("outputFormats", []) 4526 + ["TSV_VARIANT", "VCF"] 4527 ) 4528 ) 4529 ) 4530 4531 # log 4532 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4533 4534 ### ANALYSIS FILE ### 4535 ##################### 4536 4537 ### Full JSON analysis config file ### 4538 4539 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4540 with open(exomiser_analysis, "w") as fp: 4541 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4542 4543 ### SPLIT analysis and sample config files 4544 4545 # Splitted analysis dict 4546 param_exomiser_analysis_dict_for_split = ( 4547 param_exomiser_analysis_dict.copy() 4548 ) 4549 4550 # Phenopacket JSON file 4551 exomiser_analysis_phenopacket = os.path.join( 4552 tmp_dir, "analysis_phenopacket.json" 4553 ) 4554 with open(exomiser_analysis_phenopacket, "w") as fp: 4555 json.dump( 4556 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4557 fp, 4558 indent=4, 4559 ) 4560 4561 # Analysis JSON file without Phenopacket parameters 4562 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4563 exomiser_analysis_analysis = os.path.join( 4564 tmp_dir, "analysis_analysis.json" 4565 ) 4566 with open(exomiser_analysis_analysis, "w") as fp: 4567 
json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4568 4569 ### INITAL VCF file ### 4570 ####################### 4571 4572 ### Create list of samples to use and include inti initial VCF file #### 4573 4574 # Subject (main sample) 4575 # Get sample ID in analysis dict 4576 sample_subject = ( 4577 param_exomiser_analysis_dict.get("phenopacket", {}) 4578 .get("subject", {}) 4579 .get("id", None) 4580 ) 4581 sample_proband = ( 4582 param_exomiser_analysis_dict.get("phenopacket", {}) 4583 .get("proband", {}) 4584 .get("subject", {}) 4585 .get("id", None) 4586 ) 4587 sample = [] 4588 if sample_subject: 4589 sample.append(sample_subject) 4590 if sample_proband: 4591 sample.append(sample_proband) 4592 4593 # Get sample ID within Pedigree 4594 pedigree_persons_list = ( 4595 param_exomiser_analysis_dict.get("phenopacket", {}) 4596 .get("pedigree", {}) 4597 .get("persons", {}) 4598 ) 4599 4600 # Create list with all sample ID in pedigree (if exists) 4601 pedigree_persons = [] 4602 for person in pedigree_persons_list: 4603 pedigree_persons.append(person.get("individualId")) 4604 4605 # Concat subject sample ID and samples ID in pedigreesamples 4606 samples = list(set(sample + pedigree_persons)) 4607 4608 # Check if sample list is not empty 4609 if not samples: 4610 log.error(f"No samples found") 4611 raise ValueError(f"No samples found") 4612 4613 # Create VCF with sample (either sample in param or first one by default) 4614 # Export VCF file 4615 self.export_variant_vcf( 4616 vcf_file=tmp_vcf_name, 4617 remove_info=True, 4618 add_samples=True, 4619 list_samples=samples, 4620 index=False, 4621 ) 4622 4623 ### Execute Exomiser ### 4624 ######################## 4625 4626 # Init command 4627 exomiser_command = "" 4628 4629 # Command exomiser options 4630 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 4631 4632 # Release 4633 exomiser_release = 
param_exomiser.get("release", None) 4634 if exomiser_release: 4635 # phenotype data version 4636 exomiser_options += ( 4637 f" --exomiser.phenotype.data-version={exomiser_release} " 4638 ) 4639 # data version 4640 exomiser_options += ( 4641 f" --exomiser.{assembly}.data-version={exomiser_release} " 4642 ) 4643 # variant white list 4644 variant_white_list_file = ( 4645 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 4646 ) 4647 if os.path.exists( 4648 os.path.join( 4649 databases_folders, assembly, variant_white_list_file 4650 ) 4651 ): 4652 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 4653 4654 # transcript_source 4655 transcript_source = param_exomiser.get( 4656 "transcript_source", None 4657 ) # ucsc, refseq, ensembl 4658 if transcript_source: 4659 exomiser_options += ( 4660 f" --exomiser.{assembly}.transcript-source={transcript_source} " 4661 ) 4662 4663 # If analysis contain proband param 4664 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 4665 "proband", {} 4666 ): 4667 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 4668 4669 # If no proband (usually uniq sample) 4670 else: 4671 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 4672 4673 # Log 4674 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 4675 4676 # Run command 4677 result = subprocess.call( 4678 exomiser_command_analysis.split(), stdout=subprocess.PIPE 4679 ) 4680 if result: 4681 log.error("Exomiser command failed") 4682 raise ValueError("Exomiser command failed") 4683 4684 ### RESULTS ### 4685 ############### 4686 4687 ### Annotate with TSV fields ### 4688 4689 # Init result tsv file 4690 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 4691 4692 # Init result tsv file 4693 output_results_tsv = os.path.join(output_results, 
"howard.variants.tsv") 4694 4695 # Parse TSV file and explode columns in INFO field 4696 if exomiser_to_info and os.path.exists(output_results_tsv): 4697 4698 # Log 4699 log.debug("Exomiser columns to VCF INFO field") 4700 4701 # Retrieve columns and types 4702 query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 4703 output_results_tsv_df = self.get_query_to_df(query) 4704 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 4705 4706 # Init concat fields for update 4707 sql_query_update_concat_fields = [] 4708 4709 # Fields to avoid 4710 fields_to_avoid = [ 4711 "CONTIG", 4712 "START", 4713 "END", 4714 "REF", 4715 "ALT", 4716 "QUAL", 4717 "FILTER", 4718 "GENOTYPE", 4719 ] 4720 4721 # List all columns to add into header 4722 for header_column in output_results_tsv_columns: 4723 4724 # If header column is enable 4725 if header_column not in fields_to_avoid: 4726 4727 # Header info type 4728 header_info_type = "String" 4729 header_column_df = output_results_tsv_df[header_column] 4730 header_column_df_dtype = header_column_df.dtype 4731 if header_column_df_dtype == object: 4732 if ( 4733 pd.to_numeric(header_column_df, errors="coerce") 4734 .notnull() 4735 .all() 4736 ): 4737 header_info_type = "Float" 4738 else: 4739 header_info_type = "Integer" 4740 4741 # Header info 4742 characters_to_validate = ["-"] 4743 pattern = "[" + "".join(characters_to_validate) + "]" 4744 header_info_name = re.sub( 4745 pattern, 4746 "_", 4747 f"Exomiser_{header_column}".replace("#", ""), 4748 ) 4749 header_info_number = "." 
4750 header_info_description = ( 4751 f"Exomiser {header_column} annotation" 4752 ) 4753 header_info_source = "Exomiser" 4754 header_info_version = "unknown" 4755 header_info_code = CODE_TYPE_MAP[header_info_type] 4756 vcf_reader.infos[header_info_name] = vcf.parser._Info( 4757 header_info_name, 4758 header_info_number, 4759 header_info_type, 4760 header_info_description, 4761 header_info_source, 4762 header_info_version, 4763 header_info_code, 4764 ) 4765 4766 # Add field to add for update to concat fields 4767 sql_query_update_concat_fields.append( 4768 f""" 4769 CASE 4770 WHEN table_parquet."{header_column}" NOT IN ('','.') 4771 THEN concat( 4772 '{header_info_name}=', 4773 table_parquet."{header_column}", 4774 ';' 4775 ) 4776 4777 ELSE '' 4778 END 4779 """ 4780 ) 4781 4782 # Update query 4783 sql_query_update = f""" 4784 UPDATE {table_variants} as table_variants 4785 SET INFO = concat( 4786 CASE 4787 WHEN INFO NOT IN ('', '.') 4788 THEN INFO 4789 ELSE '' 4790 END, 4791 CASE 4792 WHEN table_variants.INFO NOT IN ('','.') 4793 THEN ';' 4794 ELSE '' 4795 END, 4796 ( 4797 SELECT 4798 concat( 4799 {",".join(sql_query_update_concat_fields)} 4800 ) 4801 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 4802 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 4803 AND table_parquet.\"START\" = table_variants.\"POS\" 4804 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 4805 AND table_parquet.\"REF\" = table_variants.\"REF\" 4806 ) 4807 ) 4808 ; 4809 """ 4810 4811 # Update 4812 self.conn.execute(sql_query_update) 4813 4814 ### Annotate with VCF INFO field ### 4815 4816 # Init result VCF file 4817 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 4818 4819 # If VCF exists 4820 if os.path.exists(output_results_vcf): 4821 4822 # Log 4823 log.debug("Exomiser result VCF update variants") 4824 4825 # Find Exomiser INFO field annotation in header 4826 with 
gzip.open(output_results_vcf, "rt") as f: 4827 header_list = self.read_vcf_header(f) 4828 exomiser_vcf_header = vcf.Reader( 4829 io.StringIO("\n".join(header_list)) 4830 ) 4831 4832 # Add annotation INFO field to header 4833 vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"] 4834 4835 # Update variants with VCF 4836 self.update_from_vcf(output_results_vcf) 4837 4838 return True 4839 4840 def annotation_snpeff(self, threads: int = None) -> None: 4841 """ 4842 This function annotate with snpEff 4843 4844 :param threads: The number of threads to use 4845 :return: the value of the variable "return_value". 4846 """ 4847 4848 # DEBUG 4849 log.debug("Start annotation with snpeff databases") 4850 4851 # Threads 4852 if not threads: 4853 threads = self.get_threads() 4854 log.debug("Threads: " + str(threads)) 4855 4856 # DEBUG 4857 delete_tmp = True 4858 if self.get_config().get("verbosity", "warning") in ["debug"]: 4859 delete_tmp = False 4860 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4861 4862 # Config 4863 config = self.get_config() 4864 log.debug("Config: " + str(config)) 4865 4866 # Config - Folders - Databases 4867 databases_folders = ( 4868 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 4869 ) 4870 log.debug("Databases annotations: " + str(databases_folders)) 4871 4872 # # Config - Java 4873 # java_bin = get_bin( 4874 # tool="java", 4875 # bin="java", 4876 # bin_type="bin", 4877 # config=config, 4878 # default_folder="/usr/bin", 4879 # ) 4880 # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))): 4881 # log.error(f"Annotation failed: no java bin '{java_bin}'") 4882 # raise ValueError(f"Annotation failed: no java bin '{java_bin}'") 4883 4884 # # Config - snpEff bin 4885 # snpeff_jar = get_bin( 4886 # tool="snpeff", 4887 # bin="snpEff.jar", 4888 # bin_type="jar", 4889 # config=config, 4890 # default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4891 # ) 4892 # if not (os.path.exists(snpeff_jar) or 
(snpeff_jar and which(snpeff_jar))): 4893 # log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4894 # raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4895 4896 # Config - snpEff bin command 4897 snpeff_bin_command = get_bin_command( 4898 bin="snpEff.jar", 4899 tool="snpeff", 4900 bin_type="jar", 4901 config=config, 4902 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4903 ) 4904 if not snpeff_bin_command: 4905 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 4906 log.error(msg_err) 4907 raise ValueError(msg_err) 4908 4909 # Config - snpEff databases 4910 snpeff_databases = ( 4911 config.get("folders", {}) 4912 .get("databases", {}) 4913 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 4914 ) 4915 snpeff_databases = full_path(snpeff_databases) 4916 if snpeff_databases is not None and snpeff_databases != "": 4917 log.debug(f"Create snpEff databases folder") 4918 if not os.path.exists(snpeff_databases): 4919 os.makedirs(snpeff_databases) 4920 4921 # Param 4922 param = self.get_param() 4923 log.debug("Param: " + str(param)) 4924 4925 # Param 4926 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 4927 log.debug("Options: " + str(options)) 4928 4929 # Param - Assembly 4930 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4931 4932 # Param - Options 4933 snpeff_options = ( 4934 param.get("annotation", {}).get("snpeff", {}).get("options", "") 4935 ) 4936 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 4937 snpeff_csvstats = ( 4938 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 4939 ) 4940 if snpeff_stats: 4941 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 4942 snpeff_stats = full_path(snpeff_stats) 4943 snpeff_options += f" -stats {snpeff_stats}" 4944 if snpeff_csvstats: 4945 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 4946 snpeff_csvstats = full_path(snpeff_csvstats) 4947 
snpeff_options += f" -csvStats {snpeff_csvstats}" 4948 4949 # Data 4950 table_variants = self.get_table_variants() 4951 4952 # Check if not empty 4953 log.debug("Check if not empty") 4954 sql_query_chromosomes = ( 4955 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4956 ) 4957 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 4958 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4959 log.info(f"VCF empty") 4960 return 4961 4962 # Export in VCF 4963 log.debug("Create initial file to annotate") 4964 tmp_vcf = NamedTemporaryFile( 4965 prefix=self.get_prefix(), 4966 dir=self.get_tmp_dir(), 4967 suffix=".vcf.gz", 4968 delete=True, 4969 ) 4970 tmp_vcf_name = tmp_vcf.name 4971 4972 # VCF header 4973 vcf_reader = self.get_header() 4974 log.debug("Initial header: " + str(vcf_reader.infos)) 4975 4976 # Existing annotations 4977 for vcf_annotation in self.get_header().infos: 4978 4979 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 4980 log.debug( 4981 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 4982 ) 4983 4984 # Memory limit 4985 # if config.get("memory", None): 4986 # memory_limit = config.get("memory", "8G") 4987 # else: 4988 # memory_limit = "8G" 4989 memory_limit = self.get_memory("8G") 4990 log.debug(f"memory_limit: {memory_limit}") 4991 4992 # snpEff java options 4993 snpeff_java_options = ( 4994 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4995 ) 4996 log.debug(f"Exomiser java options: {snpeff_java_options}") 4997 4998 force_update_annotation = True 4999 5000 if "ANN" not in self.get_header().infos or force_update_annotation: 5001 5002 # Check snpEff database 5003 log.debug(f"Check snpEff databases {[assembly]}") 5004 databases_download_snpeff( 5005 folder=snpeff_databases, assemblies=[assembly], config=config 5006 ) 5007 5008 # Export VCF file 5009 self.export_variant_vcf( 5010 vcf_file=tmp_vcf_name, 5011 remove_info=True, 
5012 add_samples=False, 5013 index=True, 5014 ) 5015 5016 # Tmp file 5017 err_files = [] 5018 tmp_annotate_vcf = NamedTemporaryFile( 5019 prefix=self.get_prefix(), 5020 dir=self.get_tmp_dir(), 5021 suffix=".vcf", 5022 delete=False, 5023 ) 5024 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5025 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5026 err_files.append(tmp_annotate_vcf_name_err) 5027 5028 # Command 5029 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 5030 log.debug(f"Annotation - snpEff command: {snpeff_command}") 5031 run_parallel_commands([snpeff_command], 1) 5032 5033 # Error messages 5034 log.info(f"Error/Warning messages:") 5035 error_message_command_all = [] 5036 error_message_command_warning = [] 5037 error_message_command_err = [] 5038 for err_file in err_files: 5039 with open(err_file, "r") as f: 5040 for line in f: 5041 message = line.strip() 5042 error_message_command_all.append(message) 5043 if line.startswith("[W::"): 5044 error_message_command_warning.append(message) 5045 if line.startswith("[E::"): 5046 error_message_command_err.append(f"{err_file}: " + message) 5047 # log info 5048 for message in list( 5049 set(error_message_command_err + error_message_command_warning) 5050 ): 5051 log.info(f" {message}") 5052 # debug info 5053 for message in list(set(error_message_command_all)): 5054 log.debug(f" {message}") 5055 # failed 5056 if len(error_message_command_err): 5057 log.error("Annotation failed: Error in commands") 5058 raise ValueError("Annotation failed: Error in commands") 5059 5060 # Find annotation in header 5061 with open(tmp_annotate_vcf_name, "rt") as f: 5062 header_list = self.read_vcf_header(f) 5063 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5064 5065 for ann in annovar_vcf_header.infos: 5066 if ann not in self.get_header().infos: 5067 vcf_reader.infos[ann] = 
                    annovar_vcf_header.infos.get(ann)

            # Update variants table from the snpEff-annotated VCF
            log.info(f"Annotation - Updating...")
            self.update_from_vcf(tmp_annotate_vcf_name)

        else:
            if "ANN" in self.get_header().infos:
                log.debug(f"Existing snpEff annotations in VCF")
            if force_update_annotation:
                log.debug(f"Existing snpEff annotations in VCF - annotation forced")

    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate the variants table with Annovar databases.

        Exports the current variants to a temporary bgzipped VCF, then for each
        configured Annovar database runs `table_annovar.pl` piped through
        bcftools/sed/awk to clean Annovar artifacts (ANNOVAR_DATE tags, escaped
        semicolons, empty "." fields), renames/filters INFO fields with
        `bcftools annotate`, merges all per-database annotated VCFs with
        `bcftools merge`, updates the variants table from the merged VCF, and
        finally removes the temporary files.

        :param threads: number of threads to use (defaults to `self.get_threads()`)
        :return: None; returns early if the variants table is empty
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files collected for final cleanup / error reporting
        tmp_files = []
        err_files = []

        # DEBUG
        # NOTE(review): delete_tmp is computed here but the cleanup below is
        # guarded by 'if True:' — presumably it was meant to honour delete_tmp;
        # confirm before changing behavior.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (fail fast if the binary is not resolvable)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command (fail fast if the binary is not resolvable)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        annovar_databases = full_path(annovar_databases)
        if annovar_databases != "" and not os.path.exists(annovar_databases):
            os.makedirs(annovar_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations (mapping: database name -> fields to annotate)
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty — nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug log only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # NOTE(review): hard-coded True — existing fields are always re-annotated
        force_update_annotation = True

        if annotations:

            # NOTE(review): 'commands' is assigned but never used in this method
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (download missing database files)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One table_annovar.pl run (and cleanup pipe) per configured database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): err_files is reset each iteration, so only the
                # current database's stderr is scanned below
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                # NOTE(review): annotation_renamed_list is filled but its only
                # consumer below is commented out
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (one 'old new' line per field)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: 'f' filter-based by default, 'g' gene-based for
                # refGene/ensGene databases, 'r' region-based for cytoBand
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options (extra user options appended as --key=value)
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: scan stderr files for bcftools/tool markers
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed: any error marker aborts the whole annotation
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge: combine all per-database annotated VCFs
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged file and register any
                # new INFO fields on the in-memory header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                    annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # Tmp file remove command
        # NOTE(review): 'if True:' looks like a leftover guard — presumably it
        # was meant to be 'if delete_tmp:'; confirm before changing behavior
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)

    # Parquet
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the variants table from parquet annotation databases.

        For each configured annotation database, reads the database header to
        determine the available INFO fields, builds per-chromosome SQL UPDATE
        queries that concatenate the selected fields into the variants table
        INFO column, and executes them. Supports an 'update' mode (existing
        values are first stripped and re-annotated) and an 'append' mode (only
        empty/'.' values are filled), as well as region-type databases (joined
        on positional overlap instead of exact CHROM/POS/REF/ALT match).

        :param threads: number of threads to use for the annotation
        :return: None; returns early if the variants table is empty
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is computed but not used in this method
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config: search both 'annotations' and 'parquet' database folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation ('update' replaces existing values,
        # 'append' only fills empty/'.' values)
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty — nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS (total, used for the final summary log)
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations (debug log only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns
        # NOTE(review): never populated in this method as written, so the
        # removal loop at the end is currently a no-op — confirm intent
        added_columns = []

        # drop indexes (UPDATEs below are faster without them)
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # 'ALL' pseudo-annotation: expand to every database found by scan
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists
                if not parquet_file or not parquet_hdr_file:
                    log.error("Annotation failed: file not found")
                    raise ValueError("Annotation failed: file not found")
                else:
                    # Get parquet connexion
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields
                    # if "ALL" in annotation_fields:
                    #     allow_add_extra_column = True
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                # Synthesize a generic String INFO definition
                                # for columns absent from the header file
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to aggregate (regions databases)
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Annotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate
                        # force_update_annotation = True
                        # force_append_annotation = True
                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO (update mode strips
                                # the existing value before re-annotating)
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO = REGEXP_REPLACE(
                                                concat(table_variants.INFO,''),
                                                ';*{annotation_fields_new_name}=[^;]*',
                                                ''
                                            )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                    """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header (with fallbacks for
                            # missing number/type/description metadata)
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append mode: only annotate when the current value
                            # is empty or '.'
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                        ELSE ''
                                END
                            """
                                )
                            # Found in a specific column
                            else:
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
                                        ELSE ''
                                END
                            """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # allow_annotation_full_info = True
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        # Shortcut: copy the whole INFO column instead of
                        # per-field CASE expressions
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                            """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        # NOTE(review): query_dict aliases query_dict_remove, so
                        # the 'remove field' queries run first in the loop below
                        query_dict = query_dict_remove

                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database: join on POS
                            # falling within [START+1, END], aggregate per POS
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT 
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from."#CHROM" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                            )
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                    )
                                    as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database: exact match on
                            # CHROM/POS/REF/ALT
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query: append the new annotations to
                            # INFO, inserting a ';' separator only when both the
                            # existing INFO and the new content are non-empty
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                SET INFO = 
                                    concat(
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                            THEN table_variants.INFO
                                            ELSE ''
                                        END
                                        ,
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                                AND (
                                                    concat({sql_query_annotation_update_info_sets_sql})
                                                    )
                                                    NOT IN ('','.') 
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        {sql_query_annotation_update_info_sets_sql}
                                        )
                                {sql_query_annotation_from_clause}
                                WHERE {sql_query_annotation_where_clause}
                                ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x (the generated CASE
                        # expressions can exceed DuckDB's default depth)
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # DuckDB returns the number of updated rows in a
                            # 'Count' column
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

        log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def annotation_splice(self, threads: int = None) -> None:
        """
        This function annotate with splice tools

        :param threads: The number of threads to use
        :return: the value of the variable "return_value".
6043 """ 6044 6045 # DEBUG 6046 log.debug("Start annotation with splice tools") 6047 6048 # Threads 6049 if not threads: 6050 threads = self.get_threads() 6051 log.debug("Threads: " + str(threads)) 6052 6053 # DEBUG 6054 delete_tmp = True 6055 if self.get_config().get("verbosity", "warning") in ["debug"]: 6056 delete_tmp = False 6057 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 6058 6059 # Config 6060 config = self.get_config() 6061 log.debug("Config: " + str(config)) 6062 splice_config = config.get("tools", {}).get("splice", {}) 6063 if not splice_config: 6064 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 6065 if not splice_config: 6066 msg_err = "No Splice tool config" 6067 log.error(msg_err) 6068 raise ValueError(msg_err) 6069 log.debug(f"splice_config={splice_config}") 6070 6071 # Config - Folders - Databases 6072 databases_folders = ( 6073 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 6074 ) 6075 log.debug("Databases annotations: " + str(databases_folders)) 6076 6077 # Splice docker image 6078 splice_docker_image = splice_config.get("docker").get("image") 6079 6080 # Pull splice image if it's not already there 6081 if not check_docker_image_exists(splice_docker_image): 6082 log.warning( 6083 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 6084 ) 6085 try: 6086 command(f"docker pull {splice_config.get('docker').get('image')}") 6087 except subprocess.CalledProcessError: 6088 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 6089 log.error(msg_err) 6090 raise ValueError(msg_err) 6091 return None 6092 6093 # Config - splice databases 6094 splice_databases = ( 6095 config.get("folders", {}) 6096 .get("databases", {}) 6097 .get("splice", DEFAULT_SPLICE_FOLDER) 6098 ) 6099 splice_databases = full_path(splice_databases) 6100 6101 # Param 6102 param = self.get_param() 6103 log.debug("Param: " + str(param)) 6104 6105 # Param 6106 options = 
param.get("annotation", {}).get("splice", {}) 6107 log.debug("Options: " + str(options)) 6108 6109 # Data 6110 table_variants = self.get_table_variants() 6111 6112 # Check if not empty 6113 log.debug("Check if not empty") 6114 sql_query_chromosomes = ( 6115 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 6116 ) 6117 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6118 log.info("VCF empty") 6119 return None 6120 6121 # Export in VCF 6122 log.debug("Create initial file to annotate") 6123 6124 # Create output folder 6125 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6126 if not os.path.exists(output_folder): 6127 Path(output_folder).mkdir(parents=True, exist_ok=True) 6128 6129 # Create tmp VCF file 6130 tmp_vcf = NamedTemporaryFile( 6131 prefix=self.get_prefix(), 6132 dir=output_folder, 6133 suffix=".vcf", 6134 delete=False, 6135 ) 6136 tmp_vcf_name = tmp_vcf.name 6137 6138 # VCF header 6139 header = self.get_header() 6140 6141 # Existing annotations 6142 for vcf_annotation in self.get_header().infos: 6143 6144 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6145 log.debug( 6146 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6147 ) 6148 6149 # Memory limit 6150 if config.get("memory", None): 6151 memory_limit = config.get("memory", "8G").upper() 6152 # upper() 6153 else: 6154 memory_limit = "8G" 6155 log.debug(f"memory_limit: {memory_limit}") 6156 6157 # Check number of variants to annotate 6158 where_clause_regex_spliceai = r"SpliceAI_\w+" 6159 where_clause_regex_spip = r"SPiP_\w+" 6160 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6161 df_list_of_variants_to_annotate = self.get_query_to_df( 6162 query=f""" SELECT * FROM variants {where_clause} """ 6163 ) 6164 if len(df_list_of_variants_to_annotate) == 0: 6165 log.warning( 6166 f"No variants to 
annotate with splice. Variants probably already annotated with splice" 6167 ) 6168 return None 6169 else: 6170 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6171 6172 # Export VCF file 6173 self.export_variant_vcf( 6174 vcf_file=tmp_vcf_name, 6175 remove_info=True, 6176 add_samples=True, 6177 index=False, 6178 where_clause=where_clause, 6179 ) 6180 6181 # Create docker container and launch splice analysis 6182 if splice_config: 6183 6184 # Splice mount folders 6185 mount_folders = splice_config.get("mount", {}) 6186 6187 # Genome mount 6188 mount_folders[ 6189 config.get("folders", {}) 6190 .get("databases", {}) 6191 .get("genomes", DEFAULT_GENOME_FOLDER) 6192 ] = "ro" 6193 6194 # SpliceAI mount 6195 mount_folders[ 6196 config.get("folders", {}) 6197 .get("databases", {}) 6198 .get("spliceai", DEFAULT_SPLICEAI_FOLDER) 6199 ] = "ro" 6200 6201 # Genome mount 6202 mount_folders[ 6203 config.get("folders", {}) 6204 .get("databases", {}) 6205 .get("spip", DEFAULT_SPIP_FOLDER) 6206 ] = "ro" 6207 6208 # Mount folders 6209 mount = [] 6210 6211 # Config mount 6212 mount = [ 6213 f"-v {full_path(path)}:{full_path(path)}:{mode}" 6214 for path, mode in mount_folders.items() 6215 ] 6216 6217 if any(value for value in splice_config.values() if value is None): 6218 log.warning("At least one splice config parameter is empty") 6219 return None 6220 6221 # Params in splice nf 6222 def check_values(dico: dict): 6223 """ 6224 Ensure parameters for NF splice pipeline 6225 """ 6226 for key, val in dico.items(): 6227 if key == "genome": 6228 if any( 6229 assemb in options.get("genome", {}) 6230 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6231 ): 6232 yield f"--{key} hg19" 6233 elif any( 6234 assemb in options.get("genome", {}) 6235 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6236 ): 6237 yield f"--{key} hg38" 6238 elif ( 6239 (isinstance(val, str) and val) 6240 or isinstance(val, int) 6241 or isinstance(val, bool) 6242 ): 6243 yield f"--{key} 
{val}" 6244 6245 # Genome 6246 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6247 options["genome"] = genome 6248 6249 # NF params 6250 nf_params = [] 6251 6252 # Add options 6253 if options: 6254 nf_params = list(check_values(options)) 6255 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6256 else: 6257 log.debug("No NF params provided") 6258 6259 # Add threads 6260 if "threads" not in options.keys(): 6261 nf_params.append(f"--threads {threads}") 6262 6263 # Genome path 6264 genome_path = find_genome( 6265 config.get("folders", {}) 6266 .get("databases", {}) 6267 .get("genomes", DEFAULT_GENOME_FOLDER), 6268 file=f"{genome}.fa", 6269 ) 6270 # Add genome path 6271 if not genome_path: 6272 raise ValueError( 6273 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6274 ) 6275 else: 6276 log.debug(f"Genome: {genome_path}") 6277 nf_params.append(f"--genome_path {genome_path}") 6278 6279 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6280 """ 6281 Setting up updated databases for SPiP and SpliceAI 6282 """ 6283 6284 try: 6285 6286 # SpliceAI assembly transcriptome 6287 spliceai_assembly = os.path.join( 6288 config.get("folders", {}) 6289 .get("databases", {}) 6290 .get("spliceai", {}), 6291 options.get("genome"), 6292 "transcriptome", 6293 ) 6294 spip_assembly = options.get("genome") 6295 6296 spip = find( 6297 f"transcriptome_{spip_assembly}.RData", 6298 config.get("folders", {}).get("databases", {}).get("spip", {}), 6299 ) 6300 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6301 log.debug(f"SPiP annotations: {spip}") 6302 log.debug(f"SpliceAI annotations: {spliceai}") 6303 if spip and spliceai: 6304 return [ 6305 f"--spip_transcriptome {spip}", 6306 f"--spliceai_annotations {spliceai}", 6307 ] 6308 else: 6309 # TODO crash and go on with basic annotations ? 
6310 # raise ValueError( 6311 # "Can't find splice databases in configuration EXIT" 6312 # ) 6313 log.warning( 6314 "Can't find splice databases in configuration, use annotations file from image" 6315 ) 6316 except TypeError: 6317 log.warning( 6318 "Can't find splice databases in configuration, use annotations file from image" 6319 ) 6320 return [] 6321 6322 # Add options, check if transcriptome option have already beend provided 6323 if ( 6324 "spip_transcriptome" not in nf_params 6325 and "spliceai_transcriptome" not in nf_params 6326 ): 6327 splice_reference = splice_annotations(options, config) 6328 if splice_reference: 6329 nf_params.extend(splice_reference) 6330 6331 nf_params.append(f"--output_folder {output_folder}") 6332 6333 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6334 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6335 log.debug(cmd) 6336 6337 splice_config["docker"]["command"] = cmd 6338 6339 docker_cmd = get_bin_command( 6340 tool="splice", 6341 bin_type="docker", 6342 config=config, 6343 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6344 add_options=f"--name {random_uuid} {' '.join(mount)}", 6345 ) 6346 6347 # Docker debug 6348 # if splice_config.get("rm_container"): 6349 # rm_container = "--rm" 6350 # else: 6351 # rm_container = "" 6352 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6353 6354 log.debug(docker_cmd) 6355 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6356 log.debug(res.stdout) 6357 if res.stderr: 6358 log.error(res.stderr) 6359 res.check_returncode() 6360 else: 6361 log.warning(f"Splice tool configuration not found: {config}") 6362 
        # Update variants
        log.info("Annotation - Updating...")
        # Expected output file name produced by the splice pipeline
        log.debug(
            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
        )
        output_vcf = []
        # NOTE(review): pre-existing comment said "Wrong folder to look in" —
        # the search scans the tmp VCF's folder, not the pipeline's
        # --output_folder; confirm which folder the output lands in
        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
            if (
                files
                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
            ):
                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
        # log.debug(os.listdir(options.get("output_folder")))
        # NOTE(review): this debug line indexes output_vcf[0] BEFORE the
        # emptiness check below — it raises IndexError when the pipeline
        # produced no output; it should live in the else branch
        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
        if not output_vcf:
            log.debug(
                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
            )
        else:
            # Get new header from annotated vcf
            log.debug(f"Initial header: {len(header.infos)} fields")
            # Merge splice INFO fields into the current header (new keys only)
            new_vcf = Variants(input=output_vcf[0])
            new_vcf_header = new_vcf.get_header().infos
            for keys, infos in new_vcf_header.items():
                if keys not in header.infos.keys():
                    header.infos[keys] = infos
            log.debug(f"New header: {len(header.infos)} fields")
            log.debug(f"Splice tmp output: {output_vcf[0]}")
            self.update_from_vcf(output_vcf[0])

        # Remove the temporary output folder
        remove_if_exists(output_folder)

    ###
    # Prioritization
    ###

    def get_config_default(self, name: str) -> dict:
        """
        Return the built-in default configuration for a given section.

        Known sections are "calculations" (available calculation operations,
        SQL- or Python-based) and "prioritizations" (default prioritization
        profile).

        :param name: name of the configuration section to retrieve
        :type name: str
        :return: the default configuration dict for `name`, or None (not an
            empty dict) when the section is unknown
        """

        config_default = {
            "calculations": {
                "variant_chr_pos_alt_ref": {
                    "type": "sql",
                    "name": "variant_chr_pos_alt_ref",
                    "description": "Create a variant ID with chromosome, position, alt and ref",
                    "available": False,
                    "output_column_name": "variant_chr_pos_alt_ref",
                    "output_column_type": "String",
                    "output_column_description": "variant ID with chromosome, position, alt and ref",
                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
                    "operation_info": True,
                },
                "VARTYPE": {
                    "type": "sql",
                    "name": "VARTYPE",
                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
                    "available": True,
                    "output_column_name": "VARTYPE",
                    "output_column_type": "String",
                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
                    "operation_query": """
                        CASE
                            WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
                            WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
                            WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
                            WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
                            WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
                            ELSE 'UNDEFINED'
                        END
                    """,
                    "info_fields": ["SVTYPE"],
                    "operation_info": True,
                },
                "snpeff_hgvs": {
                    "type": "python",
                    "name": "snpeff_hgvs",
                    "description": "HGVS nomenclatures from snpEff annotation",
                    "available": True,
                    "function_name": "calculation_extract_snpeff_hgvs",
                    "function_params": ["snpeff_hgvs", "ANN"],
                },
                # NOTE(review): the descriptions of "snpeff_ann_explode" and
                # "snpeff_ann_explode_uniquify" appear swapped with respect
                # to their function_params (uniquify flag False vs True)
                "snpeff_ann_explode": {
                    "type": "python",
                    "name": "snpeff_ann_explode",
                    "description": "Explode snpEff annotations with uniquify values",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "fields", "snpeff_", "ANN"],
                },
                "snpeff_ann_explode_uniquify": {
                    "type": "python",
                    "name": "snpeff_ann_explode_uniquify",
                    "description": "Explode snpEff annotations",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
                },
                "snpeff_ann_explode_json": {
                    "type": "python",
                    "name": "snpeff_ann_explode_json",
                    "description": "Explode snpEff annotations in JSON format",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
                },
                "NOMEN": {
                    "type": "python",
                    "name": "NOMEN",
                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
                    "available": True,
                    "function_name": "calculation_extract_nomen",
                    "function_params": [],
                },
                "FINDBYPIPELINE": {
                    "type": "python",
                    "name": "FINDBYPIPELINE",
                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbypipeline"],
                },
                "FINDBYSAMPLE": {
                    "type": "python",
                    "name": "FINDBYSAMPLE",
                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbysample"],
                },
                "GENOTYPECONCORDANCE": {
                    "type": "python",
                    "name": "GENOTYPECONCORDANCE",
                    "description": "Concordance of genotype for multi caller VCF",
                    "available": True,
                    "function_name": "calculation_genotype_concordance",
                    "function_params": [],
                },
                "BARCODE": {
                    "type": "python",
                    "name": "BARCODE",
                    "description": "BARCODE as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode",
                    "function_params": [],
                },
                "BARCODEFAMILY": {
                    "type": "python",
                    "name": "BARCODEFAMILY",
                    "description": "BARCODEFAMILY as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode_family",
                    "function_params": ["BCF"],
                },
                "TRIO": {
                    "type": "python",
                    "name": "TRIO",
                    "description": "Inheritance for a trio family",
                    "available": True,
                    "function_name": "calculation_trio",
                    "function_params": [],
                },
                "VAF": {
                    "type": "python",
                    "name": "VAF",
                    "description": "Variant Allele Frequency (VAF) harmonization",
                    "available": True,
                    "function_name": "calculation_vaf_normalization",
                    "function_params": [],
                },
                "VAF_stats": {
                    "type": "python",
                    "name": "VAF_stats",
                    "description": "Variant Allele Frequency (VAF) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["VAF"],
                },
                "DP_stats": {
                    "type": "python",
                    "name": "DP_stats",
                    "description": "Depth (DP) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["DP"],
                },
                "variant_id": {
                    "type": "python",
                    "name": "variant_id",
                    "description": "Variant ID generated from variant position and type",
                    "available": True,
                    "function_name": "calculation_variant_id",
                    "function_params": [],
                },
                "transcripts_json": {
                    "type": "python",
                    "name": "transcripts_json",
                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": ["transcripts_json", None],
                },
                "transcripts_ann": {
                    "type": "python",
                    "name": "transcripts_ann",
                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, "transcripts_ann"],
                },
                "transcripts_annotations": {
                    "type": "python",
                    "name": "transcripts_annotations",
                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, None],
                },
                "transcripts_prioritization": {
                    "type": "python",
                    "name": "transcripts_prioritization",
                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
                    "available": True,
                    "function_name": "calculation_transcripts_prioritization",
                    "function_params": [],
                },
            },
"prioritizations": { 6607 "default": { 6608 "ANN2": [ 6609 { 6610 "type": "contains", 6611 "value": "HIGH", 6612 "score": 5, 6613 "flag": "PASS", 6614 "comment": [ 6615 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6616 ], 6617 }, 6618 { 6619 "type": "contains", 6620 "value": "MODERATE", 6621 "score": 3, 6622 "flag": "PASS", 6623 "comment": [ 6624 "A non-disruptive variant that might change protein effectiveness" 6625 ], 6626 }, 6627 { 6628 "type": "contains", 6629 "value": "LOW", 6630 "score": 0, 6631 "flag": "FILTERED", 6632 "comment": [ 6633 "Assumed to be mostly harmless or unlikely to change protein behavior" 6634 ], 6635 }, 6636 { 6637 "type": "contains", 6638 "value": "MODIFIER", 6639 "score": 0, 6640 "flag": "FILTERED", 6641 "comment": [ 6642 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 6643 ], 6644 }, 6645 ], 6646 } 6647 }, 6648 } 6649 6650 return config_default.get(name, None) 6651 6652 def get_config_json( 6653 self, name: str, config_dict: dict = {}, config_file: str = None 6654 ) -> dict: 6655 """ 6656 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 6657 default values, a dictionary, and a file. 6658 6659 :param name: The `name` parameter in the `get_config_json` function is a string that represents 6660 the name of the configuration. It is used to identify and retrieve the configuration settings 6661 for a specific component or module 6662 :type name: str 6663 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 6664 dictionary that allows you to provide additional configuration settings or overrides. 
When you 6665 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 6666 the key is the configuration setting you want to override or 6667 :type config_dict: dict 6668 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 6669 specify the path to a configuration file that contains additional settings. If provided, the 6670 function will read the contents of this file and update the configuration dictionary with the 6671 values found in the file, overriding any existing values with the 6672 :type config_file: str 6673 :return: The function `get_config_json` returns a dictionary containing the configuration 6674 settings. 6675 """ 6676 6677 # Create with default prioritizations 6678 config_default = self.get_config_default(name=name) 6679 configuration = config_default 6680 # log.debug(f"configuration={configuration}") 6681 6682 # Replace prioritizations from dict 6683 for config in config_dict: 6684 configuration[config] = config_dict[config] 6685 6686 # Replace prioritizations from file 6687 config_file = full_path(config_file) 6688 if config_file: 6689 if os.path.exists(config_file): 6690 with open(config_file) as config_file_content: 6691 config_file_dict = json.load(config_file_content) 6692 for config in config_file_dict: 6693 configuration[config] = config_file_dict[config] 6694 else: 6695 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 6696 log.error(msg_error) 6697 raise ValueError(msg_error) 6698 6699 return configuration 6700 6701 def prioritization( 6702 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 6703 ) -> bool: 6704 """ 6705 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 6706 prioritizes variants based on configured profiles and criteria. 

        :param table: name of the table to prioritize; defaults to the
            variants table (update clause) when not provided
        :type table: str
        :param pz_prefix: prefix of the generated INFO fields; defaults to
            the "pzprefix" prioritization parameter, or "PZ"
        :type pz_prefix: str
        :param pz_param: prioritization parameters overriding the instance
            "prioritization" param section (profiles, pzfields, separator,
            score mode, config file...)
        :type pz_param: dict
        :return: True when prioritization was performed, False when no
            profile is defined
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Prioritization param: explicit argument wins over instance param
        if pz_param is not None:
            prioritization_param = pz_param
        else:
            prioritization_param = param.get("prioritization", {})

        # Configuration profiles (defaults merged with an optional JSON file)
        prioritization_config_file = prioritization_param.get(
            "prioritization_config", None
        )
        prioritization_config_file = full_path(prioritization_config_file)
        prioritizations_config = self.get_config_json(
            name="prioritizations", config_file=prioritization_config_file
        )

        # Prioritization prefix
        pz_prefix_default = "PZ"
        if pz_prefix is None:
            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)

        # Prioritization options (comma-separated strings accepted)
        profiles = prioritization_param.get("profiles", [])
        if isinstance(profiles, str):
            profiles = profiles.split(",")
        pzfields = prioritization_param.get(
            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
        )
        if isinstance(pzfields, str):
            pzfields = pzfields.split(",")
        default_profile = prioritization_param.get("default_profile", None)
        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
        prioritization_score_mode = prioritization_param.get(
            "prioritization_score_mode", "HOWARD"
        )

        # Quick Prioritizations: extra profiles from the top-level param
        prioritizations = param.get("prioritizations", None)
        if prioritizations:
            log.info("Quick Prioritization:")
            for profile in prioritizations.split(","):
                if profile not in profiles:
                    profiles.append(profile)
                    log.info(f"   {profile}")

        # If profile "ALL" provided, all profiles in the config profiles
        if "ALL" in profiles:
            profiles = list(prioritizations_config.keys())

        # Fail fast on any requested profile missing from the configuration
        for profile in profiles:
            if prioritizations_config.get(profile, None):
                log.debug(f"Profile '{profile}' configured")
            else:
                msg_error = f"Profile '{profile}' NOT configured"
                log.error(msg_error)
                raise ValueError(msg_error)

        if profiles:
            log.info(f"Prioritization... ")
        else:
            log.debug(f"No profile defined")
            return False

        # First requested profile is the default one unless configured
        if not default_profile and len(profiles):
            default_profile = profiles[0]

        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
        log.debug("Profiles to check: " + str(list(profiles)))

        # Variables
        if table is not None:
            table_variants = table
        else:
            table_variants = self.get_table_variants(clause="update")
        log.debug(f"Table to prioritize: {table_variants}")

        # Columns added to the table, removed again at the end
        added_columns = []

        # List of PZfields: bare fields plus one per (field, profile) pair
        list_of_pzfields_original = pzfields + [
            pzfield + pzfields_sep + profile
            for pzfield in pzfields
            for profile in profiles
        ]
        list_of_pzfields = []
        log.debug(f"{list_of_pzfields_original}")

        # Keep only PZfields not already present in the VCF header
        for pzfield in list_of_pzfields_original:
            if self.get_header().infos.get(pzfield, None) is None:
                list_of_pzfields.append(pzfield)
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
            else:
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")

        if list_of_pzfields:

            # Explode Infos prefix
            explode_infos_prefix = self.get_explode_infos_prefix()

            # PZfields tags description
            PZfields_INFOS = {
                f"{pz_prefix}Tags": {
                    "ID": f"{pz_prefix}Tags",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant tags based on annotation criteria",
                },
                f"{pz_prefix}Score": {
                    "ID": f"{pz_prefix}Score",
                    "Number": 1,
                    "Type": "Integer",
                    "Description": "Variant score based on annotation criteria",
                },
                f"{pz_prefix}Flag": {
                    "ID": f"{pz_prefix}Flag",
                    "Number": 1,
                    "Type": "String",
                    "Description": "Variant flag based on annotation criteria",
                },
                f"{pz_prefix}Comment": {
                    "ID": f"{pz_prefix}Comment",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant comment based on annotation criteria",
                },
                f"{pz_prefix}Infos": {
                    "ID": f"{pz_prefix}Infos",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant infos based on annotation criteria",
                },
                f"{pz_prefix}Class": {
                    "ID": f"{pz_prefix}Class",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant class based on annotation criteria",
                },
            }

            # Create INFO fields if not exist (default-profile variants)
            for field in PZfields_INFOS:
                field_ID = PZfields_INFOS[field]["ID"]
                field_description = PZfields_INFOS[field]["Description"]
                if field_ID not in self.get_header().infos and field_ID in pzfields:
                    field_description = (
                        PZfields_INFOS[field]["Description"]
                        + f", profile {default_profile}"
                    )
                    self.get_header().infos[field_ID] = vcf.parser._Info(
                        field_ID,
                        PZfields_INFOS[field]["Number"],
                        PZfields_INFOS[field]["Type"],
                        field_description,
                        "unknown",
                        "unknown",
                        code_type_map[PZfields_INFOS[field]["Type"]],
                    )

            # Create INFO fields if not exist for each profile
            # NOTE(review): `profiles == []` is dead here — the empty case
            # already returned False above
            for profile in prioritizations_config:
                if profile in profiles or profiles == []:
                    for field in PZfields_INFOS:
                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
                        field_description = (
                            PZfields_INFOS[field]["Description"]
                            + f", profile {profile}"
                        )
                        if (
                            field_ID not in self.get_header().infos
                            and field in pzfields
                        ):
                            self.get_header().infos[field_ID] = vcf.parser._Info(
                                field_ID,
                                PZfields_INFOS[field]["Number"],
                                PZfields_INFOS[field]["Type"],
                                field_description,
                                "unknown",
                                "unknown",
                                code_type_map[PZfields_INFOS[field]["Type"]],
                            )

            # Working columns, typed per field family
            # NOTE(review): pz_prefix is interpolated unescaped into these
            # regex patterns — a prefix containing regex metacharacters
            # would misbehave (re.escape would be safer)
            for pzfield in list_of_pzfields:
                if re.match(f"{pz_prefix}Score.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="INTEGER",
                        default_value="0",
                    )
                elif re.match(f"{pz_prefix}Flag.*", pzfield):
                    # BOOLEAN with default 1 ("PASS"); SQL below compares ==1/==0
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="BOOLEAN",
                        default_value="1",
                    )
                elif re.match(f"{pz_prefix}Class.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="VARCHAR[]",
                        default_value="null",
                    )
                else:
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="STRING",
                        default_value="''",
                    )
                added_columns.append(added_column)

            # Profiles
            if profiles:

                # foreach profile in configuration file
                for profile in prioritizations_config:

                    # If profile is asked in param, or ALL are asked (empty profile [])
                    if profile in profiles or profiles == []:
                        log.info(f"Profile '{profile}'")

                        sql_set_info_option = ""

                        # SQL snippets, one per INFO key=value to emit
                        sql_set_info = []

                        # PZ fields set

                        # PZScore
                        if (
                            f"{pz_prefix}Score{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Score{pzfields_sep}{profile}=',
                                    {pz_prefix}Score{pzfields_sep}{profile}
                                )
                                """
                            )
                            if (
                                profile == default_profile
                                and f"{pz_prefix}Score" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                    concat(
                                        '{pz_prefix}Score=',
                                        {pz_prefix}Score{pzfields_sep}{profile}
                                    )
                                    """
                                )

                        # PZFlag (BOOLEAN column rendered as PASS/FILTERED)
                        if (
                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Flag{pzfields_sep}{profile}=',
                                    CASE
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )
                            if (
                                profile == default_profile
                                and f"{pz_prefix}Flag" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                    concat(
                                        '{pz_prefix}Flag=',
                                        CASE
                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                            THEN 'PASS'
                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                            THEN 'FILTERED'
                                        END
                                    )
                                    """
                                )

                        # PZClass (list column aggregated to a ',' joined string)
                        if (
                            f"{pz_prefix}Class{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Class{pzfields_sep}{profile}=',
                                    CASE
                                        WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                        THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                        ELSE '.'
                                    END
                                )

                                """
                            )
                            if (
                                profile == default_profile
                                and f"{pz_prefix}Class" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                    concat(
                                        '{pz_prefix}Class=',
                                        CASE
                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                            ELSE '.'
                                        END
                                    )
                                    """
                                )

                        # PZComment (emitted only when non-empty)
                        if (
                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                            if (
                                profile == default_profile
                                and f"{pz_prefix}Comment" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                    CASE
                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                        THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
                                        ELSE ''
                                    END
                                    """
                                )

                        # PZInfos (emitted only when non-empty)
                        if (
                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                            if (
                                profile == default_profile
                                and f"{pz_prefix}Infos" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                    CASE
                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                        THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
                                        ELSE ''
                                    END
                                    """
                                )

                        # Merge PZfields: join snippets with ';' separators
                        sql_set_info_option = ""
                        sql_set_sep = ""
                        for sql_set in sql_set_info:
                            if sql_set_sep:
                                sql_set_info_option += f"""
                                    , concat('{sql_set_sep}', {sql_set})
                                """
                            else:
                                sql_set_info_option += f"""
                                    , {sql_set}
                                """
                            sql_set_sep = ";"

                        sql_queries = []
                        for annotation in prioritizations_config[profile]:

                            # skip special sections
                            if annotation.startswith("_"):
                                continue

                            # For each criterions
                            for criterion in prioritizations_config[profile][
                                annotation
7138 ]: 7139 7140 # Criterion mode 7141 criterion_mode = None 7142 if np.any( 7143 np.isin(list(criterion.keys()), ["type", "value"]) 7144 ): 7145 criterion_mode = "operation" 7146 elif np.any( 7147 np.isin(list(criterion.keys()), ["sql", "fields"]) 7148 ): 7149 criterion_mode = "sql" 7150 log.debug(f"Criterion Mode: {criterion_mode}") 7151 7152 # Criterion parameters 7153 criterion_type = criterion.get("type", None) 7154 criterion_value = criterion.get("value", None) 7155 criterion_sql = criterion.get("sql", None) 7156 criterion_fields = criterion.get("fields", None) 7157 criterion_score = criterion.get("score", 0) 7158 criterion_flag = criterion.get("flag", "PASS") 7159 criterion_class = criterion.get("class", None) 7160 criterion_flag_bool = criterion_flag == "PASS" 7161 criterion_comment = ( 7162 ", ".join(criterion.get("comment", [])) 7163 .replace("'", "''") 7164 .replace(";", ",") 7165 .replace("\t", " ") 7166 ) 7167 criterion_infos = ( 7168 str(criterion) 7169 .replace("'", "''") 7170 .replace(";", ",") 7171 .replace("\t", " ") 7172 ) 7173 7174 # SQL 7175 if criterion_sql is not None and isinstance( 7176 criterion_sql, list 7177 ): 7178 criterion_sql = " ".join(criterion_sql) 7179 7180 # Fields and explode 7181 if criterion_fields is None: 7182 criterion_fields = [annotation] 7183 if not isinstance(criterion_fields, list): 7184 criterion_fields = str(criterion_fields).split(",") 7185 7186 # Class 7187 if criterion_class is not None and not isinstance( 7188 criterion_class, list 7189 ): 7190 criterion_class = str(criterion_class).split(",") 7191 7192 for annotation_field in criterion_fields: 7193 7194 # Explode specific annotation 7195 log.debug( 7196 f"Explode annotation '{annotation_field}'" 7197 ) 7198 added_columns += self.explode_infos( 7199 prefix=explode_infos_prefix, 7200 fields=[annotation_field], 7201 table=table_variants, 7202 ) 7203 extra_infos = self.get_extra_infos( 7204 table=table_variants 7205 ) 7206 7207 # Check if annotation field is 
present 7208 if ( 7209 f"{explode_infos_prefix}{annotation_field}" 7210 not in extra_infos 7211 ): 7212 msq_err = f"Annotation '{annotation_field}' not in data" 7213 log.error(msq_err) 7214 raise ValueError(msq_err) 7215 else: 7216 log.debug( 7217 f"Annotation '{annotation_field}' in data" 7218 ) 7219 7220 sql_set = [] 7221 sql_set_info = [] 7222 7223 # PZ fields set 7224 7225 # PZScore 7226 if ( 7227 f"{pz_prefix}Score{pzfields_sep}{profile}" 7228 in list_of_pzfields 7229 ): 7230 # if prioritization_score_mode == "HOWARD": 7231 # sql_set.append( 7232 # f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7233 # ) 7234 # VaRank prioritization score mode 7235 if prioritization_score_mode == "VaRank": 7236 sql_set.append( 7237 f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END" 7238 ) 7239 # default HOWARD prioritization score mode 7240 else: 7241 sql_set.append( 7242 f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7243 ) 7244 7245 # PZFlag 7246 if ( 7247 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7248 in list_of_pzfields 7249 ): 7250 sql_set.append( 7251 f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}" 7252 ) 7253 7254 # PZClass 7255 if ( 7256 f"{pz_prefix}Class{pzfields_sep}{profile}" 7257 in list_of_pzfields 7258 and criterion_class is not None 7259 ): 7260 sql_set.append( 7261 f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) " 7262 ) 7263 7264 # PZComment 7265 if ( 7266 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7267 in list_of_pzfields 7268 ): 7269 sql_set.append( 7270 f""" 7271 {pz_prefix}Comment{pzfields_sep}{profile} = 7272 concat( 7273 {pz_prefix}Comment{pzfields_sep}{profile}, 7274 CASE 7275 WHEN 
{pz_prefix}Comment{pzfields_sep}{profile}!='' 7276 THEN ', ' 7277 ELSE '' 7278 END, 7279 '{criterion_comment}' 7280 ) 7281 """ 7282 ) 7283 7284 # PZInfos 7285 if ( 7286 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7287 in list_of_pzfields 7288 ): 7289 sql_set.append( 7290 f""" 7291 {pz_prefix}Infos{pzfields_sep}{profile} = 7292 concat( 7293 {pz_prefix}Infos{pzfields_sep}{profile}, 7294 '{criterion_infos}' 7295 ) 7296 """ 7297 ) 7298 sql_set_option = ",".join(sql_set) 7299 7300 # Criterion and comparison 7301 if sql_set_option: 7302 7303 if criterion_mode in ["operation"]: 7304 7305 try: 7306 float(criterion_value) 7307 sql_update = f""" 7308 UPDATE {table_variants} 7309 SET {sql_set_option} 7310 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 7311 AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value} 7312 """ 7313 except: 7314 contains_option = "" 7315 if criterion_type == "contains": 7316 contains_option = ".*" 7317 sql_update = f""" 7318 UPDATE {table_variants} 7319 SET {sql_set_option} 7320 WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 7321 """ 7322 sql_queries.append(sql_update) 7323 7324 elif criterion_mode in ["sql"]: 7325 7326 sql_update = f""" 7327 UPDATE {table_variants} 7328 SET {sql_set_option} 7329 WHERE {criterion_sql} 7330 """ 7331 sql_queries.append(sql_update) 7332 7333 else: 7334 msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')" 7335 log.error(msg_err) 7336 raise ValueError(msg_err) 7337 7338 else: 7339 log.warning( 7340 f"NO SQL SET option for '{annotation}' - '{criterion}'" 7341 ) 7342 7343 # PZTags 7344 if ( 7345 f"{pz_prefix}Tags{pzfields_sep}{profile}" 7346 in list_of_pzfields 7347 ): 7348 7349 # Create PZFalgs value 7350 pztags_value = "" 7351 pztags_sep_default = "," 7352 pztags_sep = "" 7353 for pzfield in pzfields: 7354 if pzfield not in [f"{pz_prefix}Tags"]: 7355 if ( 7356 
f"{pzfield}{pzfields_sep}{profile}" 7357 in list_of_pzfields 7358 ): 7359 if pzfield in [f"{pz_prefix}Flag"]: 7360 pztags_value += f"""{pztags_sep}{pzfield}#', 7361 CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile} 7362 THEN 'PASS' 7363 ELSE 'FILTERED' 7364 END, '""" 7365 elif pzfield in [f"{pz_prefix}Class"]: 7366 pztags_value += f"""{pztags_sep}{pzfield}#', 7367 CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7368 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7369 ELSE '.' 7370 END, '""" 7371 else: 7372 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 7373 pztags_sep = pztags_sep_default 7374 7375 # Add Query update for PZFlags 7376 sql_update_pztags = f""" 7377 UPDATE {table_variants} 7378 SET INFO = concat( 7379 INFO, 7380 CASE WHEN INFO NOT in ('','.') 7381 THEN ';' 7382 ELSE '' 7383 END, 7384 '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}' 7385 ) 7386 """ 7387 sql_queries.append(sql_update_pztags) 7388 7389 # Add Query update for PZFlags for default 7390 if profile == default_profile: 7391 sql_update_pztags_default = f""" 7392 UPDATE {table_variants} 7393 SET INFO = concat( 7394 INFO, 7395 ';', 7396 '{pz_prefix}Tags={pztags_value}' 7397 ) 7398 """ 7399 sql_queries.append(sql_update_pztags_default) 7400 7401 log.info(f"""Profile '{profile}' - Prioritization... """) 7402 7403 if sql_queries: 7404 7405 for sql_query in sql_queries: 7406 log.debug( 7407 f"""Profile '{profile}' - Prioritization query: {sql_query}... """ 7408 ) 7409 self.conn.execute(sql_query) 7410 7411 log.info(f"""Profile '{profile}' - Update... 
""") 7412 sql_query_update = f""" 7413 UPDATE {table_variants} 7414 SET INFO = 7415 concat( 7416 CASE 7417 WHEN INFO NOT IN ('','.') 7418 THEN concat(INFO, ';') 7419 ELSE '' 7420 END 7421 {sql_set_info_option} 7422 ) 7423 """ 7424 self.conn.execute(sql_query_update) 7425 7426 else: 7427 7428 log.warning(f"No profiles in parameters") 7429 7430 # Remove added columns 7431 for added_column in added_columns: 7432 self.drop_column(column=added_column) 7433 7434 # Explode INFOS fields into table fields 7435 if self.get_explode_infos(): 7436 self.explode_infos( 7437 prefix=self.get_explode_infos_prefix(), 7438 fields=self.get_explode_infos_fields(), 7439 force=True, 7440 ) 7441 7442 return True 7443 7444 ### 7445 # HGVS 7446 ### 7447 7448 def annotation_hgvs(self, threads: int = None) -> None: 7449 """ 7450 The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic 7451 coordinates and alleles. 7452 7453 :param threads: The `threads` parameter is an optional integer that specifies the number of 7454 threads to use for parallel processing. If no value is provided, it will default to the number 7455 of threads obtained from the `get_threads()` method 7456 :type threads: int 7457 """ 7458 7459 # Function for each partition of the Dask Dataframe 7460 def partition_function(partition): 7461 """ 7462 The function `partition_function` applies the `annotation_hgvs_partition` function to 7463 each row of a DataFrame called `partition`. 7464 7465 :param partition: The parameter "partition" is a pandas DataFrame that contains the data 7466 to be processed 7467 :return: the result of applying the "annotation_hgvs_partition" function to each row of 7468 the "partition" dataframe along the axis 1. 
7469 """ 7470 return partition.apply(annotation_hgvs_partition, axis=1) 7471 7472 def annotation_hgvs_partition(row) -> str: 7473 """ 7474 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7475 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7476 7477 :param row: A dictionary-like object that contains the values for the following keys: 7478 :return: a string that contains the HGVS names associated with the given row of data. 7479 """ 7480 7481 chr = row["CHROM"] 7482 pos = row["POS"] 7483 ref = row["REF"] 7484 alt = row["ALT"] 7485 7486 # Find list of associated transcripts 7487 transcripts_list = list( 7488 polars_conn.execute( 7489 f""" 7490 SELECT transcript 7491 FROM refseq_df 7492 WHERE CHROM='{chr}' 7493 AND POS={pos} 7494 """ 7495 )["transcript"] 7496 ) 7497 7498 # Full HGVS annotation in list 7499 hgvs_full_list = [] 7500 7501 for transcript_name in transcripts_list: 7502 7503 # Transcript 7504 transcript = get_transcript( 7505 transcripts=transcripts, transcript_name=transcript_name 7506 ) 7507 # Exon 7508 if use_exon: 7509 exon = transcript.find_exon_number(pos) 7510 else: 7511 exon = None 7512 # Protein 7513 transcript_protein = None 7514 if use_protein or add_protein or full_format: 7515 transcripts_protein = list( 7516 polars_conn.execute( 7517 f""" 7518 SELECT protein 7519 FROM refseqlink_df 7520 WHERE transcript='{transcript_name}' 7521 LIMIT 1 7522 """ 7523 )["protein"] 7524 ) 7525 if len(transcripts_protein): 7526 transcript_protein = transcripts_protein[0] 7527 7528 # HGVS name 7529 hgvs_name = format_hgvs_name( 7530 chr, 7531 pos, 7532 ref, 7533 alt, 7534 genome=genome, 7535 transcript=transcript, 7536 transcript_protein=transcript_protein, 7537 exon=exon, 7538 use_gene=use_gene, 7539 use_protein=use_protein, 7540 full_format=full_format, 7541 use_version=use_version, 7542 codon_type=codon_type, 7543 ) 7544 hgvs_full_list.append(hgvs_name) 7545 if add_protein and not 
use_protein and not full_format: 7546 hgvs_name = format_hgvs_name( 7547 chr, 7548 pos, 7549 ref, 7550 alt, 7551 genome=genome, 7552 transcript=transcript, 7553 transcript_protein=transcript_protein, 7554 exon=exon, 7555 use_gene=use_gene, 7556 use_protein=True, 7557 full_format=False, 7558 use_version=use_version, 7559 codon_type=codon_type, 7560 ) 7561 hgvs_full_list.append(hgvs_name) 7562 7563 # Create liste of HGVS annotations 7564 hgvs_full = ",".join(hgvs_full_list) 7565 7566 return hgvs_full 7567 7568 # Polars connexion 7569 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7570 7571 # Config 7572 config = self.get_config() 7573 7574 # Databases 7575 # Genome 7576 databases_genomes_folders = ( 7577 config.get("folders", {}) 7578 .get("databases", {}) 7579 .get("genomes", DEFAULT_GENOME_FOLDER) 7580 ) 7581 databases_genome = ( 7582 config.get("folders", {}).get("databases", {}).get("genomes", "") 7583 ) 7584 # refseq database folder 7585 databases_refseq_folders = ( 7586 config.get("folders", {}) 7587 .get("databases", {}) 7588 .get("refseq", DEFAULT_REFSEQ_FOLDER) 7589 ) 7590 # refseq 7591 databases_refseq = config.get("databases", {}).get("refSeq", None) 7592 # refSeqLink 7593 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 7594 7595 # Param 7596 param = self.get_param() 7597 7598 # Quick HGVS 7599 if "hgvs_options" in param and param.get("hgvs_options", ""): 7600 log.info(f"Quick HGVS Annotation:") 7601 if not param.get("hgvs", None): 7602 param["hgvs"] = {} 7603 for option in param.get("hgvs_options", "").split(","): 7604 option_var_val = option.split("=") 7605 option_var = option_var_val[0] 7606 if len(option_var_val) > 1: 7607 option_val = option_var_val[1] 7608 else: 7609 option_val = "True" 7610 if option_val.upper() in ["TRUE"]: 7611 option_val = True 7612 elif option_val.upper() in ["FALSE"]: 7613 option_val = False 7614 log.info(f" {option_var}={option_val}") 7615 param["hgvs"][option_var] = option_val 7616 
7617 # Check if HGVS annotation enabled 7618 if "hgvs" in param: 7619 log.info(f"HGVS Annotation... ") 7620 for hgvs_option in param.get("hgvs", {}): 7621 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 7622 else: 7623 return 7624 7625 # HGVS Param 7626 param_hgvs = param.get("hgvs", {}) 7627 use_exon = param_hgvs.get("use_exon", False) 7628 use_gene = param_hgvs.get("use_gene", False) 7629 use_protein = param_hgvs.get("use_protein", False) 7630 add_protein = param_hgvs.get("add_protein", False) 7631 full_format = param_hgvs.get("full_format", False) 7632 use_version = param_hgvs.get("use_version", False) 7633 codon_type = param_hgvs.get("codon_type", "3") 7634 7635 # refSseq refSeqLink 7636 databases_refseq = param_hgvs.get("refseq", databases_refseq) 7637 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 7638 7639 # Assembly 7640 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 7641 7642 # Genome 7643 genome_file = None 7644 if find_genome(databases_genome): 7645 genome_file = find_genome(databases_genome) 7646 else: 7647 genome_file = find_genome( 7648 genome_path=databases_genomes_folders, assembly=assembly 7649 ) 7650 log.debug("Genome: " + str(genome_file)) 7651 7652 # refSseq 7653 refseq_file = find_file_prefix( 7654 input_file=databases_refseq, 7655 prefix="ncbiRefSeq", 7656 folder=databases_refseq_folders, 7657 assembly=assembly, 7658 ) 7659 log.debug("refSeq: " + str(refseq_file)) 7660 7661 # refSeqLink 7662 refseqlink_file = find_file_prefix( 7663 input_file=databases_refseqlink, 7664 prefix="ncbiRefSeqLink", 7665 folder=databases_refseq_folders, 7666 assembly=assembly, 7667 ) 7668 log.debug("refSeqLink: " + str(refseqlink_file)) 7669 7670 # Threads 7671 if not threads: 7672 threads = self.get_threads() 7673 log.debug("Threads: " + str(threads)) 7674 7675 # Variables 7676 table_variants = self.get_table_variants(clause="update") 7677 7678 # Get variants SNV and InDel only 7679 
query_variants = f""" 7680 SELECT "#CHROM" AS CHROM, POS, REF, ALT 7681 FROM {table_variants} 7682 WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$' 7683 """ 7684 df_variants = self.get_query_to_df(query_variants) 7685 7686 # Added columns 7687 added_columns = [] 7688 7689 # Add hgvs column in variants table 7690 hgvs_column_name = "hgvs_" + str(random.randrange(1000)) 7691 added_column = self.add_column( 7692 table_variants, hgvs_column_name, "STRING", default_value=None 7693 ) 7694 added_columns.append(added_column) 7695 7696 log.debug(f"refSeq loading...") 7697 # refSeq in duckDB 7698 refseq_table = get_refseq_table( 7699 conn=self.conn, refseq_table="refseq", refseq_file=refseq_file 7700 ) 7701 # Loading all refSeq in Dataframe 7702 refseq_query = f""" 7703 SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript 7704 FROM {refseq_table} 7705 JOIN df_variants ON ( 7706 {refseq_table}.chrom = df_variants.CHROM 7707 AND {refseq_table}.txStart<=df_variants.POS 7708 AND {refseq_table}.txEnd>=df_variants.POS 7709 ) 7710 """ 7711 refseq_df = self.conn.query(refseq_query).pl() 7712 7713 if refseqlink_file: 7714 log.debug(f"refSeqLink loading...") 7715 # refSeqLink in duckDB 7716 refseqlink_table = get_refseq_table( 7717 conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file 7718 ) 7719 # Loading all refSeqLink in Dataframe 7720 protacc_column = "protAcc_with_ver" 7721 mrnaacc_column = "mrnaAcc_with_ver" 7722 refseqlink_query = f""" 7723 SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript 7724 FROM {refseqlink_table} 7725 JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver) 7726 WHERE protAcc_without_ver IS NOT NULL 7727 """ 7728 # Polars Dataframe 7729 refseqlink_df = self.conn.query(f"{refseqlink_query}").pl() 7730 7731 # Read RefSeq transcripts into a python dict/model. 
7732 log.debug(f"Transcripts loading...") 7733 with tempfile.TemporaryDirectory() as tmpdir: 7734 transcripts_query = f""" 7735 COPY ( 7736 SELECT {refseq_table}.* 7737 FROM {refseq_table} 7738 JOIN df_variants ON ( 7739 {refseq_table}.chrom=df_variants.CHROM 7740 AND {refseq_table}.txStart<=df_variants.POS 7741 AND {refseq_table}.txEnd>=df_variants.POS 7742 ) 7743 ) 7744 TO '{tmpdir}/transcript.tsv' (DELIMITER '\t'); 7745 """ 7746 self.conn.query(transcripts_query) 7747 with open(f"{tmpdir}/transcript.tsv") as infile: 7748 transcripts = read_transcripts(infile) 7749 7750 # Polars connexion 7751 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7752 7753 log.debug("Genome loading...") 7754 # Read genome sequence using pyfaidx. 7755 genome = Fasta(genome_file) 7756 7757 log.debug("Start annotation HGVS...") 7758 7759 # Create 7760 # a Dask Dataframe from Pandas dataframe with partition as number of threads 7761 ddf = dd.from_pandas(df_variants, npartitions=threads) 7762 7763 # Use dask.dataframe.apply() to apply function on each partition 7764 ddf[hgvs_column_name] = ddf.map_partitions(partition_function) 7765 7766 # Convert Dask DataFrame to Pandas Dataframe 7767 df = ddf.compute() 7768 7769 # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???) 
        # Persist the annotated dataframe as Parquet and use it to update the
        # variants table (Parquet round-trip works around a cast issue
        # VARCHAR -> NULL noted in the comment above)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column: join on variant coordinates and alleles,
            # copying only non-empty, non-NULL HGVS values
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column: append 'hgvs=<value>', prefixing with ';' only
        # when INFO already holds content (not empty and not '.')
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO =
                concat(
                    CASE
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add header
        # NOTE(review): "annotatation" typo below is kept as-is — it is a runtime
        # string written into the VCF header description
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        # Declare the 'hgvs' INFO field in the VCF header
        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns (temporary working columns created above)
        for added_column in added_columns:
            self.drop_column(column=added_column)

    ###
    # Calculation
    ###

    def get_operations_help(
        self, operations_config_dict: dict = {}, operations_config_file: str = None
    ) -> list:
        """
        Build a human-readable help listing of the available calculation operations.

        :param operations_config_dict: Optional calculation operations configuration
            as a dictionary, loaded/merged through `get_config_json`
        :param operations_config_file: Optional path to a calculation operations
            configuration file
        :return: A sorted list of help lines, headed by
            "Available calculation operations:"; only operations flagged
            "available" in the configuration are listed
        """

        # Init
        operations_help = []

        # operations configuration (from dict and/or file)
        operations = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )
        for op in operations:
            op_name = operations[op].get("name", op).upper()
            op_description = operations[op].get("description", op_name)
            op_available = operations[op].get("available", False)
            # Only advertise operations explicitly marked as available
            if op_available:
                operations_help.append(f" {op_name}: {op_description}")

        # Sort operations alphabetically
        operations_help.sort()

        # insert header as first line
        operations_help.insert(0, "Available calculation operations:")

        # Return
        return operations_help

    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        Run the configured calculation operations on the variants table.

        For each requested operation, looks up its definition in the operations
        configuration and dispatches to `calculation_process_function` (type
        "python") or `calculation_process_sql` (type "sql", the default).

        :param operations: Operations to run (dict keyed by operation name);
            overridden by param["calculation"]["calculations"] when present
        :param operations_config_dict: Optional operations configuration dict
        :param operations_config_file: Optional operations configuration file path
        :raises ValueError: If an operation, or its declared type, is not
            available in the operations configuration

        param json example:
            "calculation": {
                "NOMEN": {
                    "options": {
                        "hgvs_field": "hgvs"
                    },
                    "middle" : null
                }
            }
        """

        # Param
        param = self.get_param()

        # operations config, loaded from dict and/or file
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys: operation lookup is case-insensitive
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param (take precedence over the argument)
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add operations given as a comma-separated string
        if param.get("calculations", None):
            calculations_list = [
                value for value in param.get("calculations", "").split(",")
            ]
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f" {calculation_key}")
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations:
                    operations[calculation_operation.upper()] = {}
                    # Record the quick operation in the param tree as well
                    add_value_into_dict(
                        dict_tree=param,
                        sections=[
                            "calculation",
                            "calculations",
                            calculation_operation.upper(),
                        ],
                        value={},
                    )

        # Operations for calculation (fallback to param if none provided)
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

            # For each operations
            for operation_name in operations:
                operation_name = operation_name.upper()
                if operation_name not in [""]:
                    if operation_name in operations_config:
                        log.info(f"Calculation '{operation_name}'")
                        operation = operations_config[operation_name]
                        # Dispatch on operation type ("sql" by default)
                        operation_type = operation.get("type", "sql")
                        if operation_type == "python":
                            self.calculation_process_function(
                                operation=operation, operation_name=operation_name
                            )
                        elif operation_type == "sql":
                            self.calculation_process_sql(
                                operation=operation, operation_name=operation_name
                            )
                        else:
                            # Unknown operation type: log and fail
                            log.error(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                            raise ValueError(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                    else:
                        # Operation not declared in the configuration: log and fail
                        log.error(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

    def calculation_process_sql(
        self, operation: dict, operation_name: str = "unknown"
    ) -> None:
        """
        The `calculation_process_sql` function takes in a mathematical operation as a string and
        performs the operation, updating the specified table with the result.
7975 7976 :param operation: The `operation` parameter is a dictionary that contains information about the 7977 mathematical operation to be performed. It includes the following keys: 7978 :type operation: dict 7979 :param operation_name: The `operation_name` parameter is a string that represents the name of 7980 the mathematical operation being performed. It is used for logging and error handling purposes, 7981 defaults to unknown 7982 :type operation_name: str (optional) 7983 """ 7984 7985 # table variants 7986 table_variants = self.get_table_variants(clause="alter") 7987 7988 # Operation infos 7989 operation_name = operation.get("name", "unknown") 7990 log.debug(f"process sql {operation_name}") 7991 output_column_name = operation.get("output_column_name", operation_name) 7992 output_column_type = operation.get("output_column_type", "String") 7993 prefix = operation.get("explode_infos_prefix", "") 7994 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 7995 output_column_description = operation.get( 7996 "output_column_description", f"{operation_name} operation" 7997 ) 7998 operation_query = operation.get("operation_query", None) 7999 if isinstance(operation_query, list): 8000 operation_query = " ".join(operation_query) 8001 operation_info_fields = operation.get("info_fields", []) 8002 operation_info_fields_check = operation.get("info_fields_check", False) 8003 operation_info = operation.get("operation_info", True) 8004 8005 if operation_query: 8006 8007 # Info fields check 8008 operation_info_fields_check_result = True 8009 if operation_info_fields_check: 8010 header_infos = self.get_header().infos 8011 for info_field in operation_info_fields: 8012 operation_info_fields_check_result = ( 8013 operation_info_fields_check_result 8014 and info_field in header_infos 8015 ) 8016 8017 # If info fields available 8018 if operation_info_fields_check_result: 8019 8020 # Added_columns 8021 added_columns = [] 8022 8023 # Create VCF header field 
8024 vcf_reader = self.get_header() 8025 vcf_reader.infos[output_column_name] = vcf.parser._Info( 8026 output_column_name, 8027 ".", 8028 output_column_type, 8029 output_column_description, 8030 "howard calculation", 8031 "0", 8032 self.code_type_map.get(output_column_type), 8033 ) 8034 8035 # Explode infos if needed 8036 log.debug(f"calculation_process_sql prefix {prefix}") 8037 added_columns += self.explode_infos( 8038 prefix=prefix, 8039 fields=[output_column_name] + operation_info_fields, 8040 force=True, 8041 ) 8042 8043 # Create column 8044 added_column = self.add_column( 8045 table_name=table_variants, 8046 column_name=prefix + output_column_name, 8047 column_type=output_column_type_sql, 8048 default_value="null", 8049 ) 8050 added_columns.append(added_column) 8051 8052 # Operation calculation 8053 try: 8054 8055 # Query to update calculation column 8056 sql_update = f""" 8057 UPDATE {table_variants} 8058 SET "{prefix}{output_column_name}" = ({operation_query}) 8059 """ 8060 self.conn.execute(sql_update) 8061 8062 # Add to INFO 8063 if operation_info: 8064 sql_update_info = f""" 8065 UPDATE {table_variants} 8066 SET "INFO" = 8067 concat( 8068 CASE 8069 WHEN "INFO" IS NOT NULL 8070 THEN concat("INFO", ';') 8071 ELSE '' 8072 END, 8073 '{output_column_name}=', 8074 "{prefix}{output_column_name}" 8075 ) 8076 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 8077 """ 8078 self.conn.execute(sql_update_info) 8079 8080 except: 8081 log.error( 8082 f"Operations config: Calculation '{operation_name}' query failed" 8083 ) 8084 raise ValueError( 8085 f"Operations config: Calculation '{operation_name}' query failed" 8086 ) 8087 8088 # Remove added columns 8089 for added_column in added_columns: 8090 log.debug(f"added_column: {added_column}") 8091 self.drop_column(column=added_column) 8092 8093 else: 8094 log.error( 8095 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields 
{operation_info_fields}" 8096 ) 8097 raise ValueError( 8098 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8099 ) 8100 8101 else: 8102 log.error( 8103 f"Operations config: Calculation '{operation_name}' query NOT defined" 8104 ) 8105 raise ValueError( 8106 f"Operations config: Calculation '{operation_name}' query NOT defined" 8107 ) 8108 8109 def calculation_process_function( 8110 self, operation: dict, operation_name: str = "unknown" 8111 ) -> None: 8112 """ 8113 The `calculation_process_function` takes in an operation dictionary and performs the specified 8114 function with the given parameters. 8115 8116 :param operation: The `operation` parameter is a dictionary that contains information about the 8117 operation to be performed. It has the following keys: 8118 :type operation: dict 8119 :param operation_name: The `operation_name` parameter is a string that represents the name of 8120 the operation being performed. It is used for logging purposes, defaults to unknown 8121 :type operation_name: str (optional) 8122 """ 8123 8124 operation_name = operation["name"] 8125 log.debug(f"process sql {operation_name}") 8126 function_name = operation["function_name"] 8127 function_params = operation["function_params"] 8128 getattr(self, function_name)(*function_params) 8129 8130 def calculation_variant_id(self) -> None: 8131 """ 8132 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 8133 updates the INFO field of a variants table with the variant ID. 
        """

        # variant_id annotation field (column name provided by the object)
        variant_id_tag = self.get_variant_id_column()
        added_columns = [variant_id_tag]

        # variant_id tags: INFO header descriptions
        vcf_infos_tags = {
            variant_id_tag: "howard variant ID annotation",
        }

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add variant_id to header
        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
            variant_id_tag,
            ".",
            "String",
            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Update: append '<tag>=<value>' to INFO, prefixing with ';' only when
        # INFO already holds content (empty and '.' are treated as empty)
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    '{variant_id_tag}=',
                    "{variant_id_tag}"
                )
        """
        self.conn.execute(sql_update)

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
        annotation field in a VCF file and adds them as a new INFO field in the variants table.

        :param snpeff_hgvs: Name of the INFO field that will store the HGVS
            nomenclatures extracted from the SnpEff annotation field, defaults to
            snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: Name of the INFO field holding the SnpEff
            annotations to parse, defaults to ANN
        :type snpeff_field: str (optional)
        """

        # Snpeff hgvs tags (INFO header descriptions)
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix
        # NOTE(review): any truthy prefix is replaced by "INFO/" — presumably
        # intentional (exploded columns use the "INFO/" prefix); confirm
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (exploded column names)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add columns (temporary columns to drop at the end)
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract ANN header: the snpEff field description embeds the
            # sub-annotation names between single quotes, separated by " | "
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Sanitized (alphanumeric-only) name mapped to original label
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create main NOMEN column
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: the SQL references the local pandas dataframe by its
            # Python variable name (duckdb replacement scan) — do not rename
            # dataframe_snpeff_hgvs.
            # NOTE(review): target table is hard-coded as `variants` while the
            # WHERE clause uses {table_variants} — works only when both match;
            # confirm against get_table_variants()
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{snpeff_hgvs}=',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe (free memory eagerly)
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
        exploding the HGVS field and updating variant information accordingly.

        :param uniquify: Boolean flag that determines whether the output should be uniquified or not.
        When set to `True`, duplicate entries are removed from the output,
        defaults to True
        :type uniquify: bool (optional)
        :param output_format: Format of the generated annotations: "fields"
            (one INFO field per ANN sub-annotation) or "JSON" (a single INFO
            field holding JSON), defaults to fields
        :type output_format: str (optional)
        :param output_prefix: Prefix added to the generated annotation names to
            differentiate them from existing ones, defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: Name of the INFO field holding the SnpEff
            annotations to explode, defaults to ANN
        :type snpeff_field: str (optional)
        """

        # SnpEff annotation field (internal working column name)
        snpeff_hgvs = "snpeff_ann_explode"

        # Snpeff hgvs tags (INFO header descriptions)
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix
        # NOTE(review): any truthy prefix is replaced by "INFO/" — presumably
        # intentional (exploded columns use the "INFO/" prefix); confirm
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (exploded column names)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add columns (temporary columns to drop at the end)
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract ANN header: sub-annotation names live between single
            # quotes in the field description, separated by " | "
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Sanitized (alphanumeric-only) name mapped to original label
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create snpEff columns
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Header: JSON mode declares one field named after the prefix;
            # fields mode declares one field per ANN sub-annotation
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Update: the SQL references the local pandas dataframe by its
            # Python variable name (duckdb replacement scan) — do not rename
            # dataframe_snpeff_hgvs.
            # NOTE(review): target table is hard-coded as `variants` while the
            # WHERE clause uses {table_variants}; confirm both always match
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe (free memory eagerly)
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
        """

        # NOMEN field (temporary dataframe column holding the full NOMEN dict)
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: output INFO field names and their header descriptions
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Get HGVS field (INFO field holding HGVS strings; default "hgvs")
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get transcripts (optional file listing preferred transcripts, first column)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Added columns (temporary columns to drop at the end)
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Create main NOMEN column
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field
into a column 8579 dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply( 8580 lambda x: dict(x).get(nomen_field, "") 8581 ) 8582 8583 # Create VCF header field 8584 vcf_reader.infos[nomen_field] = vcf.parser._Info( 8585 nomen_field, 8586 ".", 8587 "String", 8588 nomen_dict.get(nomen_field, "howard calculation NOMEN"), 8589 "howard calculation", 8590 "0", 8591 self.code_type_map.get("String"), 8592 ) 8593 sql_nomen_fields.append( 8594 f""" 8595 CASE 8596 WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('') 8597 THEN concat( 8598 ';{nomen_field}=', 8599 dataframe_hgvs."{nomen_field}" 8600 ) 8601 ELSE '' 8602 END 8603 """ 8604 ) 8605 8606 # SQL set for update 8607 sql_nomen_fields_set = ", ".join(sql_nomen_fields) 8608 8609 # Update 8610 sql_update = f""" 8611 UPDATE variants 8612 SET "INFO" = 8613 concat( 8614 CASE 8615 WHEN "INFO" IS NULL 8616 THEN '' 8617 ELSE "INFO" 8618 END, 8619 {sql_nomen_fields_set} 8620 ) 8621 FROM dataframe_hgvs 8622 WHERE variants."#CHROM" = dataframe_hgvs."#CHROM" 8623 AND variants."POS" = dataframe_hgvs."POS" 8624 AND variants."REF" = dataframe_hgvs."REF" 8625 AND variants."ALT" = dataframe_hgvs."ALT" 8626 """ 8627 self.conn.execute(sql_update) 8628 8629 # Delete dataframe 8630 del dataframe_hgvs 8631 gc.collect() 8632 8633 # Remove added columns 8634 for added_column in added_columns: 8635 self.drop_column(column=added_column) 8636 8637 def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None: 8638 """ 8639 The function `calculation_find_by_pipeline` performs a calculation to find the number of 8640 pipeline/sample for a variant and updates the variant information in a VCF file. 8641 8642 :param tag: The `tag` parameter is a string that represents the annotation field for the 8643 "findbypipeline" information in the VCF file. 
It is used to create the annotation field in the 8644 VCF header and to update the corresponding field in the variants table, defaults to 8645 findbypipeline 8646 :type tag: str (optional) 8647 """ 8648 8649 # if FORMAT and samples 8650 if ( 8651 "FORMAT" in self.get_header_columns_as_list() 8652 and self.get_header_sample_list() 8653 ): 8654 8655 # findbypipeline annotation field 8656 findbypipeline_tag = tag 8657 8658 # VCF infos tags 8659 vcf_infos_tags = { 8660 findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})", 8661 } 8662 8663 # Prefix 8664 prefix = self.get_explode_infos_prefix() 8665 8666 # Field 8667 findbypipeline_infos = prefix + findbypipeline_tag 8668 8669 # Variants table 8670 table_variants = self.get_table_variants() 8671 8672 # Header 8673 vcf_reader = self.get_header() 8674 8675 # Create variant id 8676 variant_id_column = self.get_variant_id_column() 8677 added_columns = [variant_id_column] 8678 8679 # variant_id, FORMAT and samples 8680 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8681 self.get_header_sample_list() 8682 ) 8683 8684 # Create dataframe 8685 dataframe_findbypipeline = self.get_query_to_df( 8686 f""" SELECT {samples_fields} FROM {table_variants} """ 8687 ) 8688 8689 # Create findbypipeline column 8690 dataframe_findbypipeline[findbypipeline_infos] = ( 8691 dataframe_findbypipeline.apply( 8692 lambda row: findbypipeline( 8693 row, samples=self.get_header_sample_list() 8694 ), 8695 axis=1, 8696 ) 8697 ) 8698 8699 # Add snpeff_hgvs to header 8700 vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info( 8701 findbypipeline_tag, 8702 ".", 8703 "String", 8704 vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"), 8705 "howard calculation", 8706 "0", 8707 self.code_type_map.get("String"), 8708 ) 8709 8710 # Update 8711 sql_update = f""" 8712 UPDATE variants 8713 SET "INFO" = 8714 concat( 8715 CASE 8716 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8717 THEN '' 8718 ELSE 
concat("INFO", ';') 8719 END, 8720 CASE 8721 WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.') 8722 AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL 8723 THEN concat( 8724 '{findbypipeline_tag}=', 8725 dataframe_findbypipeline."{findbypipeline_infos}" 8726 ) 8727 ELSE '' 8728 END 8729 ) 8730 FROM dataframe_findbypipeline 8731 WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}" 8732 """ 8733 self.conn.execute(sql_update) 8734 8735 # Remove added columns 8736 for added_column in added_columns: 8737 self.drop_column(column=added_column) 8738 8739 # Delete dataframe 8740 del dataframe_findbypipeline 8741 gc.collect() 8742 8743 def calculation_genotype_concordance(self) -> None: 8744 """ 8745 The function `calculation_genotype_concordance` calculates the genotype concordance for 8746 multi-caller VCF files and updates the variant information in the database. 8747 """ 8748 8749 # if FORMAT and samples 8750 if ( 8751 "FORMAT" in self.get_header_columns_as_list() 8752 and self.get_header_sample_list() 8753 ): 8754 8755 # genotypeconcordance annotation field 8756 genotypeconcordance_tag = "genotypeconcordance" 8757 8758 # VCF infos tags 8759 vcf_infos_tags = { 8760 genotypeconcordance_tag: "Concordance of genotype for multi caller VCF", 8761 } 8762 8763 # Prefix 8764 prefix = self.get_explode_infos_prefix() 8765 8766 # Field 8767 genotypeconcordance_infos = prefix + genotypeconcordance_tag 8768 8769 # Variants table 8770 table_variants = self.get_table_variants() 8771 8772 # Header 8773 vcf_reader = self.get_header() 8774 8775 # Create variant id 8776 variant_id_column = self.get_variant_id_column() 8777 added_columns = [variant_id_column] 8778 8779 # variant_id, FORMAT and samples 8780 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8781 self.get_header_sample_list() 8782 ) 8783 8784 # Create dataframe 8785 dataframe_genotypeconcordance = self.get_query_to_df( 8786 f""" SELECT 
                {samples_fields} FROM {table_variants} """
            )

            # Create genotypeconcordance column (row-wise over genotypes)
            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
                dataframe_genotypeconcordance.apply(
                    lambda row: genotypeconcordance(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add genotypeconcordance to header
            # NOTE(review): the fallback description "snpEff hgvs annotations"
            # looks copy-pasted; it is only used when the tag is missing from
            # vcf_infos_tags, which never happens here
            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
                genotypeconcordance_tag,
                ".",
                "String",
                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: the SQL references the local pandas dataframe by its
            # Python variable name (duckdb replacement scan) — do not rename
            # dataframe_genotypeconcordance.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
                            AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
                            THEN concat(
                                '{genotypeconcordance_tag}=',
                                dataframe_genotypeconcordance."{genotypeconcordance_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_genotypeconcordance
                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe (free memory eagerly)
            del dataframe_genotypeconcordance
            gc.collect()

    def calculation_barcode(self, tag: str = "barcode") -> None:
        """
        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
        updates the INFO field in the file with the calculated barcode values.

        :param tag: Name of the INFO tag used for the barcode calculation;
            falsy values fall back to "barcode", defaults to barcode
        :type tag: str (optional)
        """

        # Only applicable when the VCF carries genotypes (FORMAT column + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field
            if not tag:
                tag = "barcode"

            # VCF infos tags (INFO header descriptions)
            vcf_infos_tags = {
                tag: "barcode calculation (VaRank)",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field (dataframe column holding the computed value)
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column (row-wise over genotypes)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
            )

            # Add barcode to header
            # NOTE(review): the default `vcf_infos_tags.get(tag)` is redundant —
            # it is the same lookup as the primary one
            vcf_reader.infos[tag] = vcf.parser._Info(
                tag,
                ".",
                "String",
                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: the SQL references the local pandas dataframe by its
            # Python variable name (duckdb replacement scan) — do not rename
            # dataframe_barcode.
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
                            AND dataframe_barcode."{barcode_infos}" NOT NULL
                            THEN concat(
                                '{tag}=',
                                dataframe_barcode."{barcode_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe (free memory eagerly)
            del dataframe_barcode
            gc.collect()

    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates family barcode values for variants in a
        VCF file and appends them (and the family sample list) to each sample's genotype.

        :param tag: Name of the FORMAT tag added during the calculation; falsy
            values fall back to "BCF", defaults to BCF
        :type tag: str (optional)
        """

        # Only applicable when the VCF carries genotypes (FORMAT column + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field
            if not tag:
                tag = "BCF"

            # VCF infos tags (FORMAT header descriptions; <tag>S lists the samples)
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # PED param: pedigree may be a JSON file path, a JSON string, a
            # comma-separated sample list, or a dict
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON: treat as comma-separated sample names
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list
                ped_samples = list(ped.values())

            else:
                # No pedigree configured: every sample belongs to the family
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Field (dataframe column holding the computed value)
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and family samples only
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column (row-wise over the family genotypes)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family (and its sample list) to header as FORMAT fields
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Update: append ':<barcode>:<samples>' to each sample genotype and
            # ':<tag>:<tag>S' to FORMAT; './.' genotypes are first padded with
            # one '.' per FORMAT key so field counts stay aligned
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    # Sample outside the family: annotate with missing values
                    value = "'.'"
                    value_samples = "'.'"
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # The SQL references the local pandas dataframe by its Python
            # variable name (duckdb replacement scan) — do not rename it
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe (free memory eagerly)
            del dataframe_barcode
            gc.collect()

    def calculation_trio(self) -> None:
        """
        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
        information to the INFO field of each variant.
9138 """ 9139 9140 # if FORMAT and samples 9141 if ( 9142 "FORMAT" in self.get_header_columns_as_list() 9143 and self.get_header_sample_list() 9144 ): 9145 9146 # trio annotation field 9147 trio_tag = "trio" 9148 9149 # VCF infos tags 9150 vcf_infos_tags = { 9151 "trio": "trio calculation", 9152 } 9153 9154 # Param 9155 param = self.get_param() 9156 9157 # Prefix 9158 prefix = self.get_explode_infos_prefix() 9159 9160 # Trio param 9161 trio_ped = ( 9162 param.get("calculation", {}) 9163 .get("calculations", {}) 9164 .get("TRIO", {}) 9165 .get("trio_pedigree", None) 9166 ) 9167 9168 # Load trio 9169 if trio_ped: 9170 9171 # Trio pedigree is a file 9172 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 9173 log.debug("TRIO pedigree is file") 9174 with open(full_path(trio_ped)) as trio_ped: 9175 trio_ped = json.load(trio_ped) 9176 9177 # Trio pedigree is a string 9178 elif isinstance(trio_ped, str): 9179 log.debug("TRIO pedigree is str") 9180 try: 9181 trio_ped = json.loads(trio_ped) 9182 log.debug("TRIO pedigree is json str") 9183 except ValueError as e: 9184 trio_samples = trio_ped.split(",") 9185 if len(trio_samples) == 3: 9186 trio_ped = { 9187 "father": trio_samples[0], 9188 "mother": trio_samples[1], 9189 "child": trio_samples[2], 9190 } 9191 log.debug("TRIO pedigree is list str") 9192 else: 9193 msg_error = "TRIO pedigree not well formatted" 9194 log.error(msg_error) 9195 raise ValueError(msg_error) 9196 9197 # Trio pedigree is a dict 9198 elif isinstance(trio_ped, dict): 9199 log.debug("TRIO pedigree is dict") 9200 9201 # Trio pedigree is not well formatted 9202 else: 9203 msg_error = "TRIO pedigree not well formatted" 9204 log.error(msg_error) 9205 raise ValueError(msg_error) 9206 9207 # Construct trio list 9208 trio_samples = [ 9209 trio_ped.get("father", ""), 9210 trio_ped.get("mother", ""), 9211 trio_ped.get("child", ""), 9212 ] 9213 9214 else: 9215 log.debug("TRIO pedigree not defined. 
Take the first 3 samples") 9216 samples_list = self.get_header_sample_list() 9217 if len(samples_list) >= 3: 9218 trio_samples = self.get_header_sample_list()[0:3] 9219 trio_ped = { 9220 "father": trio_samples[0], 9221 "mother": trio_samples[1], 9222 "child": trio_samples[2], 9223 } 9224 else: 9225 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 9226 log.error(msg_error) 9227 raise ValueError(msg_error) 9228 9229 # Check trio pedigree 9230 if not trio_ped or len(trio_ped) != 3: 9231 msg_error = f"Error in TRIO pedigree: {trio_ped}" 9232 log.error(msg_error) 9233 raise ValueError(msg_error) 9234 9235 # Log 9236 log.info( 9237 f"Calculation 'TRIO' - Samples: " 9238 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 9239 ) 9240 9241 # Field 9242 trio_infos = prefix + trio_tag 9243 9244 # Variants table 9245 table_variants = self.get_table_variants() 9246 9247 # Header 9248 vcf_reader = self.get_header() 9249 9250 # Create variant id 9251 variant_id_column = self.get_variant_id_column() 9252 added_columns = [variant_id_column] 9253 9254 # variant_id, FORMAT and samples 9255 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9256 self.get_header_sample_list() 9257 ) 9258 9259 # Create dataframe 9260 dataframe_trio = self.get_query_to_df( 9261 f""" SELECT {samples_fields} FROM {table_variants} """ 9262 ) 9263 9264 # Create trio column 9265 dataframe_trio[trio_infos] = dataframe_trio.apply( 9266 lambda row: trio(row, samples=trio_samples), axis=1 9267 ) 9268 9269 # Add trio to header 9270 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9271 trio_tag, 9272 ".", 9273 "String", 9274 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9275 "howard calculation", 9276 "0", 9277 self.code_type_map.get("String"), 9278 ) 9279 9280 # Update 9281 sql_update = f""" 9282 UPDATE {table_variants} 9283 SET "INFO" = 9284 concat( 9285 CASE 9286 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9287 THEN '' 9288 ELSE 
concat("INFO", ';') 9289 END, 9290 CASE 9291 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9292 AND dataframe_trio."{trio_infos}" NOT NULL 9293 THEN concat( 9294 '{trio_tag}=', 9295 dataframe_trio."{trio_infos}" 9296 ) 9297 ELSE '' 9298 END 9299 ) 9300 FROM dataframe_trio 9301 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9302 """ 9303 self.conn.execute(sql_update) 9304 9305 # Remove added columns 9306 for added_column in added_columns: 9307 self.drop_column(column=added_column) 9308 9309 # Delete dataframe 9310 del dataframe_trio 9311 gc.collect() 9312 9313 def calculation_vaf_normalization(self) -> None: 9314 """ 9315 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9316 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9317 :return: The function does not return anything. 9318 """ 9319 9320 # if FORMAT and samples 9321 if ( 9322 "FORMAT" in self.get_header_columns_as_list() 9323 and self.get_header_sample_list() 9324 ): 9325 9326 # vaf_normalization annotation field 9327 vaf_normalization_tag = "VAF" 9328 9329 # VCF infos tags 9330 vcf_infos_tags = { 9331 "VAF": "VAF Variant Frequency", 9332 } 9333 9334 # Prefix 9335 prefix = self.get_explode_infos_prefix() 9336 9337 # Variants table 9338 table_variants = self.get_table_variants() 9339 9340 # Header 9341 vcf_reader = self.get_header() 9342 9343 # Do not calculate if VAF already exists 9344 if "VAF" in vcf_reader.formats: 9345 log.debug("VAF already on genotypes") 9346 return 9347 9348 # Create variant id 9349 variant_id_column = self.get_variant_id_column() 9350 added_columns = [variant_id_column] 9351 9352 # variant_id, FORMAT and samples 9353 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9354 f""" "{sample}" """ for sample in self.get_header_sample_list() 9355 ) 9356 9357 # Create dataframe 9358 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} 
FROM {table_variants} """ 9359 log.debug(f"query={query}") 9360 dataframe_vaf_normalization = self.get_query_to_df(query=query) 9361 9362 vaf_normalization_set = [] 9363 9364 # for each sample vaf_normalization 9365 for sample in self.get_header_sample_list(): 9366 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9367 lambda row: vaf_normalization(row, sample=sample), axis=1 9368 ) 9369 vaf_normalization_set.append( 9370 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9371 ) 9372 9373 # Add VAF to FORMAT 9374 dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[ 9375 "FORMAT" 9376 ].apply(lambda x: str(x) + ":VAF") 9377 vaf_normalization_set.append( 9378 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9379 ) 9380 9381 # Add vaf_normalization to header 9382 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9383 id=vaf_normalization_tag, 9384 num="1", 9385 type="Float", 9386 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9387 type_code=self.code_type_map.get("Float"), 9388 ) 9389 9390 # Create fields to add in INFO 9391 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9392 9393 # Update 9394 sql_update = f""" 9395 UPDATE {table_variants} 9396 SET {sql_vaf_normalization_set} 9397 FROM dataframe_vaf_normalization 9398 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9399 9400 """ 9401 self.conn.execute(sql_update) 9402 9403 # Remove added columns 9404 for added_column in added_columns: 9405 self.drop_column(column=added_column) 9406 9407 # Delete dataframe 9408 del dataframe_vaf_normalization 9409 gc.collect() 9410 9411 def calculation_genotype_stats(self, info: str = "VAF") -> None: 9412 """ 9413 The `calculation_genotype_stats` function calculates genotype statistics for a given information 9414 field in a VCF file and updates the INFO column of the variants table with the calculated 9415 statistics. 
9416 9417 :param info: The `info` parameter is a string that represents the type of information for which 9418 genotype statistics are calculated. It is used to generate various VCF info tags for the 9419 statistics, such as the number of occurrences, the list of values, the minimum value, the 9420 maximum value, the mean, the median, defaults to VAF 9421 :type info: str (optional) 9422 """ 9423 9424 # if FORMAT and samples 9425 if ( 9426 "FORMAT" in self.get_header_columns_as_list() 9427 and self.get_header_sample_list() 9428 ): 9429 9430 # vaf_stats annotation field 9431 vaf_stats_tag = info + "_stats" 9432 9433 # VCF infos tags 9434 vcf_infos_tags = { 9435 info + "_stats_nb": f"genotype {info} Statistics - number of {info}", 9436 info + "_stats_list": f"genotype {info} Statistics - list of {info}", 9437 info + "_stats_min": f"genotype {info} Statistics - min {info}", 9438 info + "_stats_max": f"genotype {info} Statistics - max {info}", 9439 info + "_stats_mean": f"genotype {info} Statistics - mean {info}", 9440 info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}", 9441 info 9442 + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}", 9443 } 9444 9445 # Prefix 9446 prefix = self.get_explode_infos_prefix() 9447 9448 # Field 9449 vaf_stats_infos = prefix + vaf_stats_tag 9450 9451 # Variants table 9452 table_variants = self.get_table_variants() 9453 9454 # Header 9455 vcf_reader = self.get_header() 9456 9457 # Create variant id 9458 variant_id_column = self.get_variant_id_column() 9459 added_columns = [variant_id_column] 9460 9461 # variant_id, FORMAT and samples 9462 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9463 self.get_header_sample_list() 9464 ) 9465 9466 # Create dataframe 9467 dataframe_vaf_stats = self.get_query_to_df( 9468 f""" SELECT {samples_fields} FROM {table_variants} """ 9469 ) 9470 9471 # Create vaf_stats column 9472 dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply( 9473 
lambda row: genotype_stats( 9474 row, samples=self.get_header_sample_list(), info=info 9475 ), 9476 axis=1, 9477 ) 9478 9479 # List of vcf tags 9480 sql_vaf_stats_fields = [] 9481 9482 # Check all VAF stats infos 9483 for stat in vcf_infos_tags: 9484 9485 # Extract stats 9486 dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply( 9487 lambda x: dict(x).get(stat, "") 9488 ) 9489 9490 # Add snpeff_hgvs to header 9491 vcf_reader.infos[stat] = vcf.parser._Info( 9492 stat, 9493 ".", 9494 "String", 9495 vcf_infos_tags.get(stat, "genotype statistics"), 9496 "howard calculation", 9497 "0", 9498 self.code_type_map.get("String"), 9499 ) 9500 9501 if len(sql_vaf_stats_fields): 9502 sep = ";" 9503 else: 9504 sep = "" 9505 9506 # Create fields to add in INFO 9507 sql_vaf_stats_fields.append( 9508 f""" 9509 CASE 9510 WHEN dataframe_vaf_stats."{stat}" NOT NULL 9511 THEN concat( 9512 '{sep}{stat}=', 9513 dataframe_vaf_stats."{stat}" 9514 ) 9515 ELSE '' 9516 END 9517 """ 9518 ) 9519 9520 # SQL set for update 9521 sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields) 9522 9523 # Update 9524 sql_update = f""" 9525 UPDATE {table_variants} 9526 SET "INFO" = 9527 concat( 9528 CASE 9529 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9530 THEN '' 9531 ELSE concat("INFO", ';') 9532 END, 9533 {sql_vaf_stats_fields_set} 9534 ) 9535 FROM dataframe_vaf_stats 9536 WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}" 9537 9538 """ 9539 self.conn.execute(sql_update) 9540 9541 # Remove added columns 9542 for added_column in added_columns: 9543 self.drop_column(column=added_column) 9544 9545 # Delete dataframe 9546 del dataframe_vaf_stats 9547 gc.collect() 9548 9549 def calculation_transcripts_annotation( 9550 self, info_json: str = None, info_format: str = None 9551 ) -> None: 9552 """ 9553 The `calculation_transcripts_annotation` function creates a transcripts table and adds an info 9554 field to it if transcripts are available. 
9555 9556 :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method 9557 is a string parameter that represents the information field to be used in the transcripts JSON. 9558 It is used to specify the JSON format for the transcripts information. If no value is provided 9559 when calling the method, it defaults to " 9560 :type info_json: str 9561 :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation` 9562 method is a string parameter that specifies the format of the information field to be used in 9563 the transcripts JSON. It is used to define the format of the information field 9564 :type info_format: str 9565 """ 9566 9567 # Create transcripts table 9568 transcripts_table = self.create_transcript_view() 9569 9570 # Add info field 9571 if transcripts_table: 9572 self.transcript_view_to_variants( 9573 transcripts_table=transcripts_table, 9574 transcripts_info_field_json=info_json, 9575 transcripts_info_field_format=info_format, 9576 ) 9577 else: 9578 log.info("No Transcripts to process. Check param.json file configuration") 9579 9580 def calculation_transcripts_prioritization(self) -> None: 9581 """ 9582 The function `calculation_transcripts_prioritization` creates a transcripts table and 9583 prioritizes transcripts based on certain criteria. 9584 """ 9585 9586 # Create transcripts table 9587 transcripts_table = self.create_transcript_view() 9588 9589 # Add info field 9590 if transcripts_table: 9591 self.transcripts_prioritization(transcripts_table=transcripts_table) 9592 else: 9593 log.info("No Transcripts to process. 
Check param.json file configuration") 9594 9595 ############### 9596 # Transcripts # 9597 ############### 9598 9599 def transcripts_prioritization( 9600 self, transcripts_table: str = None, param: dict = {} 9601 ) -> bool: 9602 """ 9603 The `transcripts_prioritization` function prioritizes transcripts based on certain parameters 9604 and updates the variants table with the prioritized information. 9605 9606 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 9607 of the table containing transcripts data. If no value is provided, it defaults to "transcripts". 9608 This parameter is used to identify the table where the transcripts data is stored for the 9609 prioritization process 9610 :type transcripts_table: str 9611 :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary 9612 that contains various configuration settings for the prioritization process of transcripts. It 9613 is used to customize the behavior of the prioritization algorithm and includes settings such as 9614 the prefix for prioritization fields, default profiles, and other 9615 :type param: dict 9616 :return: The function `transcripts_prioritization` returns a boolean value `True` if the 9617 transcripts prioritization process is successfully completed, and `False` if there are any 9618 issues or if no profile is defined for transcripts prioritization. 
9619 """ 9620 9621 log.debug("Start transcripts prioritization...") 9622 9623 # Param 9624 if not param: 9625 param = self.get_param() 9626 9627 # Variants table 9628 table_variants = self.get_table_variants() 9629 log.debug(f"transcripts_table={transcripts_table}") 9630 # Transcripts table 9631 if transcripts_table is None: 9632 log.debug(f"transcripts_table={transcripts_table}") 9633 transcripts_table = self.create_transcript_view( 9634 transcripts_table="transcripts", param=param 9635 ) 9636 log.debug(f"transcripts_table={transcripts_table}") 9637 if transcripts_table is None: 9638 msg_err = "No Transcripts table availalble" 9639 log.error(msg_err) 9640 raise ValueError(msg_err) 9641 9642 # Get transcripts columns 9643 columns_as_list_query = f""" 9644 DESCRIBE {transcripts_table} 9645 """ 9646 columns_as_list = list( 9647 self.get_query_to_df(columns_as_list_query)["column_name"] 9648 ) 9649 9650 # Create INFO if not exists 9651 if "INFO" not in columns_as_list: 9652 query_add_info = f""" 9653 ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT ''; 9654 """ 9655 self.execute_query(query_add_info) 9656 9657 # Prioritization param and Force only PZ Score and Flag 9658 pz_param = param.get("transcripts", {}).get("prioritization", {}) 9659 pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score" 9660 pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag" 9661 pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript" 9662 pz_param["pzfields"] = [pz_fields_score, pz_fields_flag] 9663 pz_profile_default = ( 9664 param.get("transcripts", {}).get("prioritization", {}).get("profiles", None) 9665 ) 9666 9667 # Exit if no profile 9668 if pz_profile_default is None: 9669 log.warning("No profile defined for transcripts prioritization") 9670 return False 9671 9672 # Prioritization 9673 prioritization_result = self.prioritization( 9674 table=transcripts_table, 9675 pz_param=param.get("transcripts", {}).get("prioritization", {}), 9676 ) 9677 if not 
prioritization_result: 9678 log.warning("Transcripts prioritization not processed") 9679 return False 9680 9681 # Explode PZ fields 9682 self.explode_infos( 9683 table=transcripts_table, 9684 fields=param.get("transcripts", {}) 9685 .get("prioritization", {}) 9686 .get("pzfields", []), 9687 ) 9688 9689 # Export Transcripts prioritization infos to variants table 9690 query_update = f""" 9691 WITH RankedTranscripts AS ( 9692 SELECT 9693 "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag}, 9694 ROW_NUMBER() OVER ( 9695 PARTITION BY "#CHROM", POS, REF, ALT 9696 ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC 9697 ) AS rn 9698 FROM 9699 {transcripts_table} 9700 ) 9701 UPDATE {table_variants} 9702 SET 9703 INFO = CONCAT(CASE 9704 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9705 THEN '' 9706 ELSE concat("INFO", ';') 9707 END, 9708 concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag}) 9709 ) 9710 FROM 9711 RankedTranscripts 9712 WHERE 9713 rn = 1 9714 AND variants."#CHROM" = RankedTranscripts."#CHROM" 9715 AND variants."POS" = RankedTranscripts."POS" 9716 AND variants."REF" = RankedTranscripts."REF" 9717 AND variants."ALT" = RankedTranscripts."ALT" 9718 9719 """ 9720 self.execute_query(query=query_update) 9721 9722 # Add PZ Transcript in header 9723 self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info( 9724 pz_fields_transcripts, 9725 ".", 9726 "String", 9727 f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}", 9728 "unknown", 9729 "unknown", 9730 code_type_map["String"], 9731 ) 9732 9733 # Return 9734 return True 9735 9736 def create_transcript_view_from_columns_map( 9737 self, 9738 transcripts_table: str = "transcripts", 9739 columns_maps: dict = {}, 9740 added_columns: list = [], 9741 temporary_tables: list = None, 9742 annotation_fields: list = None, 9743 ) -> tuple[list, list, list]: 9744 """ 9745 The 
`create_transcript_view_from_columns_map` function generates a temporary table view based on 9746 specified columns mapping for transcripts data. 9747 9748 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of 9749 the table where the transcripts data is stored or will be stored in the database. This table 9750 typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, 9751 predictions, etc. It defaults to "transcripts, defaults to transcripts 9752 :type transcripts_table: str (optional) 9753 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about 9754 how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list 9755 represents a mapping configuration for a specific set of columns. It typically includes details such 9756 as the main transcript column and additional information columns 9757 :type columns_maps: dict 9758 :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map` 9759 function is a list that stores the additional columns that will be added to the view being created 9760 based on the columns map provided. These columns are generated by exploding the transcript 9761 information columns along with the main transcript column 9762 :type added_columns: list 9763 :param temporary_tables: The `temporary_tables` parameter in the 9764 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 9765 tables created during the process of creating a transcript view from a columns map. 
These temporary 9766 tables are used to store intermediate results or transformations before the final view is generated 9767 :type temporary_tables: list 9768 :param annotation_fields: The `annotation_fields` parameter in the 9769 `create_transcript_view_from_columns_map` function is a list that stores the fields that are used 9770 for annotation in the query view creation process. These fields are extracted from the 9771 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 9772 :type annotation_fields: list 9773 :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three 9774 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 9775 """ 9776 9777 log.debug("Start transcrpts view creation from columns map...") 9778 9779 # "from_columns_map": [ 9780 # { 9781 # "transcripts_column": "Ensembl_transcriptid", 9782 # "transcripts_infos_columns": [ 9783 # "genename", 9784 # "Ensembl_geneid", 9785 # "LIST_S2_score", 9786 # "LIST_S2_pred", 9787 # ], 9788 # }, 9789 # { 9790 # "transcripts_column": "Ensembl_transcriptid", 9791 # "transcripts_infos_columns": [ 9792 # "genename", 9793 # "VARITY_R_score", 9794 # "Aloft_pred", 9795 # ], 9796 # }, 9797 # ], 9798 9799 # Init 9800 if temporary_tables is None: 9801 temporary_tables = [] 9802 if annotation_fields is None: 9803 annotation_fields = [] 9804 9805 # Variants table 9806 table_variants = self.get_table_variants() 9807 9808 for columns_map in columns_maps: 9809 9810 # Transcript column 9811 transcripts_column = columns_map.get("transcripts_column", None) 9812 9813 # Transcripts infos columns 9814 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 9815 9816 if transcripts_column is not None: 9817 9818 # Explode 9819 added_columns += self.explode_infos( 9820 fields=[transcripts_column] + transcripts_infos_columns 9821 ) 9822 9823 # View clauses 9824 clause_select = [] 9825 for field in [transcripts_column] + 
transcripts_infos_columns: 9826 clause_select.append( 9827 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 9828 ) 9829 if field not in [transcripts_column]: 9830 annotation_fields.append(field) 9831 9832 # Querey View 9833 query = f""" 9834 SELECT 9835 "#CHROM", POS, REF, ALT, INFO, 9836 "{transcripts_column}" AS 'transcript', 9837 {", ".join(clause_select)} 9838 FROM ( 9839 SELECT 9840 "#CHROM", POS, REF, ALT, INFO, 9841 {", ".join(clause_select)} 9842 FROM {table_variants} 9843 ) 9844 WHERE "{transcripts_column}" IS NOT NULL 9845 """ 9846 9847 # Create temporary table 9848 temporary_table = transcripts_table + "".join( 9849 random.choices(string.ascii_uppercase + string.digits, k=10) 9850 ) 9851 9852 # Temporary_tables 9853 temporary_tables.append(temporary_table) 9854 query_view = f""" 9855 CREATE TEMPORARY TABLE {temporary_table} 9856 AS ({query}) 9857 """ 9858 self.execute_query(query=query_view) 9859 9860 return added_columns, temporary_tables, annotation_fields 9861 9862 def create_transcript_view_from_column_format( 9863 self, 9864 transcripts_table: str = "transcripts", 9865 column_formats: dict = {}, 9866 temporary_tables: list = None, 9867 annotation_fields: list = None, 9868 ) -> tuple[list, list, list]: 9869 """ 9870 The `create_transcript_view_from_column_format` function generates a transcript view based on 9871 specified column formats, adds additional columns and annotation fields, and returns the list of 9872 temporary tables and annotation fields. 9873 9874 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of 9875 the table containing the transcripts data. This table will be used as the base table for creating 9876 the transcript view. 
The default value for this parameter is "transcripts", but you can provide a 9877 different table name if needed, defaults to transcripts 9878 :type transcripts_table: str (optional) 9879 :param column_formats: The `column_formats` parameter is a dictionary that contains information 9880 about the columns to be used for creating the transcript view. Each entry in the dictionary 9881 specifies the mapping between a transcripts column and a transcripts infos column. For example, in 9882 the provided code snippet: 9883 :type column_formats: dict 9884 :param temporary_tables: The `temporary_tables` parameter in the 9885 `create_transcript_view_from_column_format` function is a list that stores the names of temporary 9886 views created during the process of creating a transcript view from a column format. These temporary 9887 views are used to manipulate and extract data before generating the final transcript view. It 9888 :type temporary_tables: list 9889 :param annotation_fields: The `annotation_fields` parameter in the 9890 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 9891 that are extracted from the temporary views created during the process. These annotation fields are 9892 obtained by querying the temporary views and extracting the column names excluding specific columns 9893 like `#CH 9894 :type annotation_fields: list 9895 :return: The `create_transcript_view_from_column_format` function returns two lists: 9896 `temporary_tables` and `annotation_fields`. 
9897 """ 9898 9899 log.debug("Start transcrpts view creation from column format...") 9900 9901 # "from_column_format": [ 9902 # { 9903 # "transcripts_column": "ANN", 9904 # "transcripts_infos_column": "Feature_ID", 9905 # } 9906 # ], 9907 9908 # Init 9909 if temporary_tables is None: 9910 temporary_tables = [] 9911 if annotation_fields is None: 9912 annotation_fields = [] 9913 9914 for column_format in column_formats: 9915 9916 # annotation field and transcript annotation field 9917 annotation_field = column_format.get("transcripts_column", "ANN") 9918 transcript_annotation = column_format.get( 9919 "transcripts_infos_column", "Feature_ID" 9920 ) 9921 9922 # Temporary View name 9923 temporary_view_name = transcripts_table + "".join( 9924 random.choices(string.ascii_uppercase + string.digits, k=10) 9925 ) 9926 9927 # Create temporary view name 9928 temporary_view_name = self.annotation_format_to_table( 9929 uniquify=True, 9930 annotation_field=annotation_field, 9931 view_name=temporary_view_name, 9932 annotation_id=transcript_annotation, 9933 ) 9934 9935 # Annotation fields 9936 if temporary_view_name: 9937 query_annotation_fields = f""" 9938 SELECT * 9939 FROM ( 9940 DESCRIBE SELECT * 9941 FROM {temporary_view_name} 9942 ) 9943 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 9944 """ 9945 df_annotation_fields = self.get_query_to_df( 9946 query=query_annotation_fields 9947 ) 9948 9949 # Add temporary view and annotation fields 9950 temporary_tables.append(temporary_view_name) 9951 annotation_fields += list(set(df_annotation_fields["column_name"])) 9952 9953 return temporary_tables, annotation_fields 9954 9955 def create_transcript_view( 9956 self, 9957 transcripts_table: str = None, 9958 transcripts_table_drop: bool = True, 9959 param: dict = {}, 9960 ) -> str: 9961 """ 9962 The `create_transcript_view` function generates a transcript view by processing data from a 9963 specified table based on provided parameters and structural information. 
9964 9965 :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function 9966 is used to specify the name of the table that will store the final transcript view data. If a table 9967 name is not provided, the function will create a new table to store the transcript view data, and by 9968 default,, defaults to transcripts 9969 :type transcripts_table: str (optional) 9970 :param transcripts_table_drop: The `transcripts_table_drop` parameter in the 9971 `create_transcript_view` function is a boolean parameter that determines whether to drop the 9972 existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, 9973 the function will drop the existing transcripts table if it exists, defaults to True 9974 :type transcripts_table_drop: bool (optional) 9975 :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that 9976 contains information needed to create a transcript view. It includes details such as the structure 9977 of the transcripts, columns mapping, column formats, and other necessary information for generating 9978 the view. This parameter allows for flexibility and customization 9979 :type param: dict 9980 :return: The `create_transcript_view` function returns the name of the transcripts table that was 9981 created or modified during the execution of the function. 
9982 """ 9983 9984 log.debug("Start transcripts view creation...") 9985 9986 # Default 9987 transcripts_table_default = "transcripts" 9988 9989 # Param 9990 if not param: 9991 param = self.get_param() 9992 9993 # Struct 9994 struct = param.get("transcripts", {}).get("struct", None) 9995 9996 if struct: 9997 9998 # Transcripts table 9999 if transcripts_table is None: 10000 transcripts_table = param.get("transcripts", {}).get( 10001 "table", transcripts_table_default 10002 ) 10003 10004 # added_columns 10005 added_columns = [] 10006 10007 # Temporary tables 10008 temporary_tables = [] 10009 10010 # Annotation fields 10011 annotation_fields = [] 10012 10013 # from columns map 10014 columns_maps = struct.get("from_columns_map", []) 10015 added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = ( 10016 self.create_transcript_view_from_columns_map( 10017 transcripts_table=transcripts_table, 10018 columns_maps=columns_maps, 10019 added_columns=added_columns, 10020 temporary_tables=temporary_tables, 10021 annotation_fields=annotation_fields, 10022 ) 10023 ) 10024 added_columns += added_columns_tmp 10025 temporary_tables += temporary_tables_tmp 10026 annotation_fields += annotation_fields_tmp 10027 10028 # from column format 10029 column_formats = struct.get("from_column_format", []) 10030 temporary_tables_tmp, annotation_fields_tmp = ( 10031 self.create_transcript_view_from_column_format( 10032 transcripts_table=transcripts_table, 10033 column_formats=column_formats, 10034 temporary_tables=temporary_tables, 10035 annotation_fields=annotation_fields, 10036 ) 10037 ) 10038 temporary_tables += temporary_tables_tmp 10039 annotation_fields += annotation_fields_tmp 10040 10041 # Merge temporary tables query 10042 query_merge = "" 10043 for temporary_table in temporary_tables: 10044 10045 # First temporary table 10046 if not query_merge: 10047 query_merge = f""" 10048 SELECT * FROM {temporary_table} 10049 """ 10050 # other temporary table (using UNION) 10051 else: 10052 
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
    ) -> str:
        """
        The function `annotation_format_to_table` converts a structured annotation INFO
        field (e.g. snpEff 'ANN') into a temporary table with one column per annotation
        sub-field and one row per transcript.

        The sub-field names are taken from the single-quoted part of the field's header
        description (split on " | "); the annotation values are exploded to JSON through
        `explode_annotation_format`, then each JSON key becomes a typed column.

        :param uniquify: Whether to keep unique annotation entries when exploding the
            annotation field, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: The INFO field containing the annotations (e.g. "ANN"),
            defaults to ANN
        :type annotation_field: str (optional)
        :param annotation_id: The annotation sub-field used as the transcript identifier
            (non-alphanumeric characters are stripped), defaults to Feature_ID
        :type annotation_id: str (optional)
        :param view_name: Name of the temporary table to create, defaults to transcripts
        :type view_name: str (optional)
        :return: The name of the created table, or None if `annotation_field` is not
            declared in the header.
        :raises ValueError: If the header description of the annotation field does not
            contain a quoted, pipe-separated sub-field list.
        """

        # Name of the intermediate column holding the exploded (JSON) annotation
        annotation_format = "annotation_explode"

        # Transcript annotation: keep only alphanumeric characters (used as SQL identifier)
        annotation_id = "".join(char for char in annotation_id if char.isalnum())

        # Prefix
        # NOTE(review): this overwrites any non-empty prefix with "INFO/"; an empty
        # prefix stays empty. Possibly intended as `if not prefix:` — TODO confirm
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names of the exploded annotation field and of the JSON intermediate
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added to the variants table along the way (dropped before returning)
        added_columns = []

        # Explode the annotation field into its own column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the annotation sub-field names from the header description:
            # the first single-quoted group, split on " | "
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                # Map alphanumeric-only names back to the original sub-field names
                for i in range(len(ann_header_match)):
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id (temporary column)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe with coordinates, INFO, variant id and the annotation column
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Explode each annotation value into a JSON document keyed by sub-field
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Find the distinct JSON keys present in the exploded annotations
            # (the dataframe is queried by name through duckdb's implicit registration)
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # For each key, detect a column type and build the typed projection
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key
                key = row.iloc[0]

                # key_clean: alphanumeric-only version usable as SQL identifier
                key_clean = "".join(char for char in key if char.isalnum())

                # Extract all values of this key to sample their type
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type from the remaining non-empty values
                column_type = detect_column_type(df_json_type[key_clean])

                # Typed column expression; empty strings become NULL before the cast
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Create the temporary table: one row per transcript, with the annotation id
            # exposed as the 'transcript' column
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                    )
                );
            """
            self.execute_query(query=query_view)

        else:

            # Annotation field absent from the header: nothing created
            view_name = None

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
This parameter allows you to define the column in the variants table that will hold the 10281 JSON-formatted information about transcripts 10282 :type transcripts_info_json: str 10283 :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to 10284 specify the field in the VCF header that will contain information about transcripts in JSON 10285 format. This field will be added to the VCF header as an INFO field with the specified name 10286 :type transcripts_info_field_json: str 10287 :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the 10288 format of the information about transcripts that will be stored in the variants table. This 10289 format can be used to define how the transcript information will be structured or displayed 10290 within the variants table 10291 :type transcripts_info_format: str 10292 :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to 10293 specify the field in the VCF header that will contain information about transcripts in a 10294 specific format. This field will be added to the VCF header as an INFO field with the specified 10295 name 10296 :type transcripts_info_field_format: str 10297 :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary 10298 that contains various configuration settings related to transcripts. It is used to provide 10299 default values for certain parameters if they are not explicitly provided when calling the 10300 method. The `param` dictionary can be passed as an argument 10301 :type param: dict 10302 :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True` 10303 if the operation is successful and `False` if certain conditions are not met. 
10304 """ 10305 10306 msg_info_prefix = "Start transcripts view to variants annotations" 10307 10308 log.debug(f"{msg_info_prefix}...") 10309 10310 # Default 10311 transcripts_table_default = "transcripts" 10312 transcripts_column_id_default = "transcript" 10313 transcripts_info_json_default = None 10314 transcripts_info_format_default = None 10315 transcripts_info_field_json_default = None 10316 transcripts_info_field_format_default = None 10317 10318 # Param 10319 if not param: 10320 param = self.get_param() 10321 10322 # Transcripts table 10323 if transcripts_table is None: 10324 transcripts_table = param.get("transcripts", {}).get( 10325 "table", transcripts_table_default 10326 ) 10327 10328 # Transcripts column ID 10329 if transcripts_column_id is None: 10330 transcripts_column_id = param.get("transcripts", {}).get( 10331 "column_id", transcripts_column_id_default 10332 ) 10333 10334 # Transcripts info json 10335 if transcripts_info_json is None: 10336 transcripts_info_json = param.get("transcripts", {}).get( 10337 "transcripts_info_json", transcripts_info_json_default 10338 ) 10339 10340 # Transcripts info field JSON 10341 if transcripts_info_field_json is None: 10342 transcripts_info_field_json = param.get("transcripts", {}).get( 10343 "transcripts_info_field_json", transcripts_info_field_json_default 10344 ) 10345 # if transcripts_info_field_json is not None and transcripts_info_json is None: 10346 # transcripts_info_json = transcripts_info_field_json 10347 10348 # Transcripts info format 10349 if transcripts_info_format is None: 10350 transcripts_info_format = param.get("transcripts", {}).get( 10351 "transcripts_info_format", transcripts_info_format_default 10352 ) 10353 10354 # Transcripts info field FORMAT 10355 if transcripts_info_field_format is None: 10356 transcripts_info_field_format = param.get("transcripts", {}).get( 10357 "transcripts_info_field_format", transcripts_info_field_format_default 10358 ) 10359 # if ( 10360 # 
transcripts_info_field_format is not None 10361 # and transcripts_info_format is None 10362 # ): 10363 # transcripts_info_format = transcripts_info_field_format 10364 10365 # Variants table 10366 table_variants = self.get_table_variants() 10367 10368 # Check info columns param 10369 if ( 10370 transcripts_info_json is None 10371 and transcripts_info_field_json is None 10372 and transcripts_info_format is None 10373 and transcripts_info_field_format is None 10374 ): 10375 return False 10376 10377 # Transcripts infos columns 10378 query_transcripts_infos_columns = f""" 10379 SELECT * 10380 FROM ( 10381 DESCRIBE SELECT * FROM {transcripts_table} 10382 ) 10383 WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}') 10384 """ 10385 transcripts_infos_columns = list( 10386 self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"] 10387 ) 10388 10389 # View results 10390 clause_select = [] 10391 clause_to_json = [] 10392 clause_to_format = [] 10393 for field in transcripts_infos_columns: 10394 clause_select.append( 10395 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10396 ) 10397 clause_to_json.append(f""" '{field}': "{field}" """) 10398 clause_to_format.append(f""" "{field}" """) 10399 10400 # Update 10401 update_set_json = [] 10402 update_set_format = [] 10403 10404 # VCF header 10405 vcf_reader = self.get_header() 10406 10407 # Transcripts to info column in JSON 10408 if transcripts_info_json is not None: 10409 10410 # Create column on variants table 10411 self.add_column( 10412 table_name=table_variants, 10413 column_name=transcripts_info_json, 10414 column_type="JSON", 10415 default_value=None, 10416 drop=False, 10417 ) 10418 10419 # Add header 10420 vcf_reader.infos[transcripts_info_json] = vcf.parser._Info( 10421 transcripts_info_json, 10422 ".", 10423 "String", 10424 "Transcripts in JSON format", 10425 "unknwon", 10426 "unknwon", 10427 self.code_type_map["String"], 10428 ) 10429 10430 # Add to update 
10431 update_set_json.append( 10432 f""" {transcripts_info_json}=t.{transcripts_info_json} """ 10433 ) 10434 10435 # Transcripts to info field in JSON 10436 if transcripts_info_field_json is not None: 10437 10438 log.debug(f"{msg_info_prefix} - Annotation in JSON format...") 10439 10440 # Add to update 10441 update_set_json.append( 10442 f""" 10443 INFO = concat( 10444 CASE 10445 WHEN INFO NOT IN ('', '.') 10446 THEN INFO 10447 ELSE '' 10448 END, 10449 CASE 10450 WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.') 10451 THEN concat( 10452 ';{transcripts_info_field_json}=', 10453 t.{transcripts_info_json} 10454 ) 10455 ELSE '' 10456 END 10457 ) 10458 """ 10459 ) 10460 10461 # Add header 10462 vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info( 10463 transcripts_info_field_json, 10464 ".", 10465 "String", 10466 "Transcripts in JSON format", 10467 "unknwon", 10468 "unknwon", 10469 self.code_type_map["String"], 10470 ) 10471 10472 if update_set_json: 10473 10474 # Update query 10475 query_update = f""" 10476 UPDATE {table_variants} 10477 SET {", ".join(update_set_json)} 10478 FROM 10479 ( 10480 SELECT 10481 "#CHROM", POS, REF, ALT, 10482 concat( 10483 '{{', 10484 string_agg( 10485 '"' || "{transcripts_column_id}" || '":' || 10486 to_json(json_output) 10487 ), 10488 '}}' 10489 )::JSON AS {transcripts_info_json} 10490 FROM 10491 ( 10492 SELECT 10493 "#CHROM", POS, REF, ALT, 10494 "{transcripts_column_id}", 10495 to_json( 10496 {{{",".join(clause_to_json)}}} 10497 )::JSON AS json_output 10498 FROM 10499 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 10500 WHERE "{transcripts_column_id}" IS NOT NULL 10501 ) 10502 GROUP BY "#CHROM", POS, REF, ALT 10503 ) AS t 10504 WHERE {table_variants}."#CHROM" = t."#CHROM" 10505 AND {table_variants}."POS" = t."POS" 10506 AND {table_variants}."REF" = t."REF" 10507 AND {table_variants}."ALT" = t."ALT" 10508 """ 10509 10510 
self.execute_query(query=query_update) 10511 10512 # Transcripts to info column in FORMAT 10513 if transcripts_info_format is not None: 10514 10515 # Create column on variants table 10516 self.add_column( 10517 table_name=table_variants, 10518 column_name=transcripts_info_format, 10519 column_type="VARCHAR", 10520 default_value=None, 10521 drop=False, 10522 ) 10523 10524 # Add header 10525 vcf_reader.infos[transcripts_info_format] = vcf.parser._Info( 10526 transcripts_info_format, 10527 ".", 10528 "String", 10529 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 10530 "unknwon", 10531 "unknwon", 10532 self.code_type_map["String"], 10533 ) 10534 10535 # Add to update 10536 update_set_format.append( 10537 f""" {transcripts_info_format}=t.{transcripts_info_format} """ 10538 ) 10539 10540 # Transcripts to info field in JSON 10541 if transcripts_info_field_format is not None: 10542 10543 log.debug(f"{msg_info_prefix} - Annotation in structured format...") 10544 10545 # Add to update 10546 update_set_format.append( 10547 f""" 10548 INFO = concat( 10549 CASE 10550 WHEN INFO NOT IN ('', '.') 10551 THEN INFO 10552 ELSE '' 10553 END, 10554 CASE 10555 WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.') 10556 THEN concat( 10557 ';{transcripts_info_field_format}=', 10558 t.{transcripts_info_format} 10559 ) 10560 ELSE '' 10561 END 10562 ) 10563 """ 10564 ) 10565 10566 # Add header 10567 vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info( 10568 transcripts_info_field_format, 10569 ".", 10570 "String", 10571 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 10572 "unknwon", 10573 "unknwon", 10574 self.code_type_map["String"], 10575 ) 10576 10577 if update_set_format: 10578 10579 # Update query 10580 query_update = f""" 10581 UPDATE {table_variants} 10582 SET {", ".join(update_set_format)} 10583 FROM 10584 ( 10585 SELECT 10586 "#CHROM", POS, REF, ALT, 10587 
string_agg({transcripts_info_format}) AS {transcripts_info_format} 10588 FROM 10589 ( 10590 SELECT 10591 "#CHROM", POS, REF, ALT, 10592 "{transcripts_column_id}", 10593 concat( 10594 "{transcripts_column_id}", 10595 '|', 10596 {", '|', ".join(clause_to_format)} 10597 ) AS {transcripts_info_format} 10598 FROM 10599 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 10600 ) 10601 GROUP BY "#CHROM", POS, REF, ALT 10602 ) AS t 10603 WHERE {table_variants}."#CHROM" = t."#CHROM" 10604 AND {table_variants}."POS" = t."POS" 10605 AND {table_variants}."REF" = t."REF" 10606 AND {table_variants}."ALT" = t."ALT" 10607 """ 10608 10609 self.execute_query(query=query_update) 10610 10611 return True
34class Variants: 35 36 def __init__( 37 self, 38 conn=None, 39 input: str = None, 40 output: str = None, 41 config: dict = {}, 42 param: dict = {}, 43 load: bool = False, 44 ) -> None: 45 """ 46 The function `__init__` initializes the variables, sets the input, output, config, param, connexion and 47 header 48 49 :param conn: the connection to the database 50 :param input: the input file 51 :param output: the output file 52 :param config: a dictionary containing the configuration of the model 53 :param param: a dictionary containing the parameters of the model 54 """ 55 56 # Init variables 57 self.init_variables() 58 59 # Input 60 self.set_input(input) 61 62 # Config 63 self.set_config(config) 64 65 # Param 66 self.set_param(param) 67 68 # Output 69 self.set_output(output) 70 71 # connexion 72 self.set_connexion(conn) 73 74 # Header 75 self.set_header() 76 77 # Samples 78 self.set_samples() 79 80 # Load data 81 if load: 82 self.load_data() 83 84 def set_samples(self, samples: list = None) -> list: 85 """ 86 The function `set_samples` sets the samples attribute of an object to a provided list or 87 retrieves it from a parameter dictionary. 88 89 :param samples: The `set_samples` method is a method of a class that takes a list of samples as 90 input and sets the `samples` attribute of the class to the provided list. If no samples are 91 provided, it tries to get the samples from the class's parameters using the `get_param` method 92 :type samples: list 93 :return: The `samples` list is being returned. 94 """ 95 96 if not samples: 97 samples = self.get_param().get("samples", {}).get("list", None) 98 99 self.samples = samples 100 101 return samples 102 103 def get_samples(self) -> list: 104 """ 105 This function returns a list of samples. 106 :return: The `get_samples` method is returning the `samples` attribute of the object. 
107 """ 108 109 return self.samples 110 111 def get_samples_check(self) -> bool: 112 """ 113 This function returns the value of the "check" key within the "samples" dictionary retrieved 114 from the parameters. 115 :return: The method `get_samples_check` is returning the value of the key "check" inside the 116 "samples" dictionary, which is nested inside the dictionary returned by the `get_param()` 117 method. If the key "check" is not found, it will return `False`. 118 """ 119 120 return self.get_param().get("samples", {}).get("check", True) 121 122 def set_input(self, input: str = None) -> None: 123 """ 124 The function `set_input` takes a file name as input, extracts the name and extension, and sets 125 attributes in the class accordingly. 126 127 :param input: The `set_input` method in the provided code snippet is used to set attributes 128 related to the input file. Here's a breakdown of the parameters and their usage in the method: 129 :type input: str 130 """ 131 132 if input and not isinstance(input, str): 133 try: 134 self.input = input.name 135 except: 136 log.error(f"Input file '{input} in bad format") 137 raise ValueError(f"Input file '{input} in bad format") 138 else: 139 self.input = input 140 141 # Input format 142 if input: 143 input_name, input_extension = os.path.splitext(self.input) 144 self.input_name = input_name 145 self.input_extension = input_extension 146 self.input_format = self.input_extension.replace(".", "") 147 148 def set_config(self, config: dict) -> None: 149 """ 150 The set_config function takes a config object and assigns it as the configuration object for the 151 class. 152 153 :param config: The `config` parameter in the `set_config` function is a dictionary object that 154 contains configuration settings for the class. 
When you call the `set_config` function with a 155 dictionary object as the argument, it will set that dictionary as the configuration object for 156 the class 157 :type config: dict 158 """ 159 160 self.config = config 161 162 def set_param(self, param: dict) -> None: 163 """ 164 This function sets a parameter object for the class based on the input dictionary. 165 166 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 167 as the `param` attribute of the class instance 168 :type param: dict 169 """ 170 171 self.param = param 172 173 def init_variables(self) -> None: 174 """ 175 This function initializes the variables that will be used in the rest of the class 176 """ 177 178 self.prefix = "howard" 179 self.table_variants = "variants" 180 self.dataframe = None 181 182 self.comparison_map = { 183 "gt": ">", 184 "gte": ">=", 185 "lt": "<", 186 "lte": "<=", 187 "equals": "=", 188 "contains": "SIMILAR TO", 189 } 190 191 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 192 193 self.code_type_map_to_sql = { 194 "Integer": "INTEGER", 195 "String": "VARCHAR", 196 "Float": "FLOAT", 197 "Flag": "VARCHAR", 198 } 199 200 self.index_additionnal_fields = [] 201 202 def get_indexing(self) -> bool: 203 """ 204 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 205 returns False. 206 :return: The value of the indexing parameter. 207 """ 208 209 return self.get_param().get("indexing", False) 210 211 def get_connexion_config(self) -> dict: 212 """ 213 The function `get_connexion_config` returns a dictionary containing the configuration for a 214 connection, including the number of threads and memory limit. 215 :return: a dictionary containing the configuration for the Connexion library. 
216 """ 217 218 # config 219 config = self.get_config() 220 221 # Connexion config 222 connexion_config = {} 223 threads = self.get_threads() 224 225 # Threads 226 if threads: 227 connexion_config["threads"] = threads 228 229 # Memory 230 # if config.get("memory", None): 231 # connexion_config["memory_limit"] = config.get("memory") 232 if self.get_memory(): 233 connexion_config["memory_limit"] = self.get_memory() 234 235 # Temporary directory 236 if config.get("tmp", None): 237 connexion_config["temp_directory"] = config.get("tmp") 238 239 # Access 240 if config.get("access", None): 241 access = config.get("access") 242 if access in ["RO"]: 243 access = "READ_ONLY" 244 elif access in ["RW"]: 245 access = "READ_WRITE" 246 connexion_db = self.get_connexion_db() 247 if connexion_db in ":memory:": 248 access = "READ_WRITE" 249 connexion_config["access_mode"] = access 250 251 return connexion_config 252 253 def get_duckdb_settings(self) -> dict: 254 """ 255 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 256 string. 257 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 258 """ 259 260 # config 261 config = self.get_config() 262 263 # duckdb settings 264 duckdb_settings_dict = {} 265 if config.get("duckdb_settings", None): 266 duckdb_settings = config.get("duckdb_settings") 267 duckdb_settings = full_path(duckdb_settings) 268 # duckdb setting is a file 269 if os.path.exists(duckdb_settings): 270 with open(duckdb_settings) as json_file: 271 duckdb_settings_dict = yaml.safe_load(json_file) 272 # duckdb settings is a string 273 else: 274 duckdb_settings_dict = json.loads(duckdb_settings) 275 276 return duckdb_settings_dict 277 278 def set_connexion_db(self) -> str: 279 """ 280 The function `set_connexion_db` returns the appropriate database connection string based on the 281 input format and connection type. 282 :return: the value of the variable `connexion_db`. 
283 """ 284 285 # Default connexion db 286 default_connexion_db = ":memory:" 287 288 # Find connexion db 289 if self.get_input_format() in ["db", "duckdb"]: 290 connexion_db = self.get_input() 291 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 292 connexion_db = default_connexion_db 293 elif self.get_connexion_type() in ["tmpfile"]: 294 tmp_name = tempfile.mkdtemp( 295 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 296 ) 297 connexion_db = f"{tmp_name}/tmp.db" 298 elif self.get_connexion_type() != "": 299 connexion_db = self.get_connexion_type() 300 else: 301 connexion_db = default_connexion_db 302 303 # Set connexion db 304 self.connexion_db = connexion_db 305 306 return connexion_db 307 308 def set_connexion(self, conn) -> None: 309 """ 310 The function `set_connexion` creates a connection to a database, with options for different 311 database formats and settings. 312 313 :param conn: The `conn` parameter in the `set_connexion` method is the connection to the 314 database. If a connection is not provided, a new connection to an in-memory database is created. 
315 The method then proceeds to set up the connection based on the specified format (e.g., duckdb or 316 sqlite 317 """ 318 319 # Connexion db 320 connexion_db = self.set_connexion_db() 321 322 # Connexion config 323 connexion_config = self.get_connexion_config() 324 325 # Connexion format 326 connexion_format = self.get_config().get("connexion_format", "duckdb") 327 # Set connexion format 328 self.connexion_format = connexion_format 329 330 # Connexion 331 if not conn: 332 if connexion_format in ["duckdb"]: 333 conn = duckdb.connect(connexion_db, config=connexion_config) 334 # duckDB settings 335 duckdb_settings = self.get_duckdb_settings() 336 if duckdb_settings: 337 for setting in duckdb_settings: 338 setting_value = duckdb_settings.get(setting) 339 if isinstance(setting_value, str): 340 setting_value = f"'{setting_value}'" 341 conn.execute(f"PRAGMA {setting}={setting_value};") 342 elif connexion_format in ["sqlite"]: 343 conn = sqlite3.connect(connexion_db) 344 345 # Set connexion 346 self.conn = conn 347 348 # Log 349 log.debug(f"connexion_format: {connexion_format}") 350 log.debug(f"connexion_db: {connexion_db}") 351 log.debug(f"connexion config: {connexion_config}") 352 log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}") 353 354 def set_output(self, output: str = None) -> None: 355 """ 356 The `set_output` function in Python sets the output file based on the input or a specified key 357 in the config file, extracting the output name, extension, and format. 358 359 :param output: The `output` parameter in the `set_output` method is used to specify the name of 360 the output file. If the config file has an 'output' key, the method sets the output to the value 361 of that key. 
If no output is provided, it sets the output to `None` 362 :type output: str 363 """ 364 365 if output and not isinstance(output, str): 366 self.output = output.name 367 else: 368 self.output = output 369 370 # Output format 371 if self.output: 372 output_name, output_extension = os.path.splitext(self.output) 373 self.output_name = output_name 374 self.output_extension = output_extension 375 self.output_format = self.output_extension.replace(".", "") 376 else: 377 self.output_name = None 378 self.output_extension = None 379 self.output_format = None 380 381 def set_header(self) -> None: 382 """ 383 It reads the header of a VCF file and stores it as a list of strings and as a VCF object 384 """ 385 386 input_file = self.get_input() 387 default_header_list = [ 388 "##fileformat=VCFv4.2", 389 "#CHROM POS ID REF ALT QUAL FILTER INFO", 390 ] 391 392 # Full path 393 input_file = full_path(input_file) 394 395 if input_file: 396 397 input_format = self.get_input_format() 398 input_compressed = self.get_input_compressed() 399 config = self.get_config() 400 header_list = default_header_list 401 if input_format in [ 402 "vcf", 403 "hdr", 404 "tsv", 405 "csv", 406 "psv", 407 "parquet", 408 "db", 409 "duckdb", 410 ]: 411 # header provided in param 412 if config.get("header_file", None): 413 with open(config.get("header_file"), "rt") as f: 414 header_list = self.read_vcf_header(f) 415 # within a vcf file format (header within input file itsself) 416 elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file): 417 # within a compressed vcf file format (.vcf.gz) 418 if input_compressed: 419 with bgzf.open(input_file, "rt") as f: 420 header_list = self.read_vcf_header(f) 421 # within an uncompressed vcf file format (.vcf) 422 else: 423 with open(input_file, "rt") as f: 424 header_list = self.read_vcf_header(f) 425 # header provided in default external file .hdr 426 elif os.path.exists((input_file + ".hdr")): 427 with open(input_file + ".hdr", "rt") as f: 428 header_list = 
self.read_vcf_header(f) 429 else: 430 try: # Try to get header info fields and file columns 431 432 with tempfile.TemporaryDirectory() as tmpdir: 433 434 # Create database 435 db_for_header = Database(database=input_file) 436 437 # Get header columns for infos fields 438 db_header_from_columns = ( 439 db_for_header.get_header_from_columns() 440 ) 441 442 # Get real columns in the file 443 db_header_columns = db_for_header.get_columns() 444 445 # Write header file 446 header_file_tmp = os.path.join(tmpdir, "header") 447 f = open(header_file_tmp, "w") 448 vcf.Writer(f, db_header_from_columns) 449 f.close() 450 451 # Replace #CHROM line with rel columns 452 header_list = db_for_header.read_header_file( 453 header_file=header_file_tmp 454 ) 455 header_list[-1] = "\t".join(db_header_columns) 456 457 except: 458 459 log.warning( 460 f"No header for file {input_file}. Set as default VCF header" 461 ) 462 header_list = default_header_list 463 464 else: # try for unknown format ? 465 466 log.error(f"Input file format '{input_format}' not available") 467 raise ValueError(f"Input file format '{input_format}' not available") 468 469 if not header_list: 470 header_list = default_header_list 471 472 # header as list 473 self.header_list = header_list 474 475 # header as VCF object 476 self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list))) 477 478 else: 479 480 self.header_list = None 481 self.header_vcf = None 482 483 def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame: 484 """ 485 The `get_query_to_df` function takes a query as a string and returns the result as a pandas 486 DataFrame based on the connection format. 487 488 :param query: The `query` parameter in the `get_query_to_df` function is a string that 489 represents the SQL query you want to execute. 
This query will be used to fetch data from a 490 database and convert it into a pandas DataFrame 491 :type query: str 492 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 493 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 494 function will only fetch up to that number of rows from the database query result. If no limit 495 is specified, 496 :type limit: int 497 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 498 """ 499 500 # Connexion format 501 connexion_format = self.get_connexion_format() 502 503 # Limit in query 504 if limit: 505 pd.set_option("display.max_rows", limit) 506 if connexion_format in ["duckdb"]: 507 df = ( 508 self.conn.execute(query) 509 .fetch_record_batch(limit) 510 .read_next_batch() 511 .to_pandas() 512 ) 513 elif connexion_format in ["sqlite"]: 514 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 515 516 # Full query 517 else: 518 if connexion_format in ["duckdb"]: 519 df = self.conn.execute(query).df() 520 elif connexion_format in ["sqlite"]: 521 df = pd.read_sql_query(query, self.conn) 522 523 return df 524 525 def get_overview(self) -> None: 526 """ 527 The function prints the input, output, config, and dataframe of the current object 528 """ 529 table_variants_from = self.get_table_variants(clause="from") 530 sql_columns = self.get_header_columns_as_sql() 531 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 532 df = self.get_query_to_df(sql_query_export) 533 log.info( 534 "Input: " 535 + str(self.get_input()) 536 + " [" 537 + str(str(self.get_input_format())) 538 + "]" 539 ) 540 log.info( 541 "Output: " 542 + str(self.get_output()) 543 + " [" 544 + str(str(self.get_output_format())) 545 + "]" 546 ) 547 log.info("Config: ") 548 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 549 "\n" 550 ): 551 log.info("\t" + str(d)) 552 log.info("Param: ") 553 for d 
in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 554 "\n" 555 ): 556 log.info("\t" + str(d)) 557 log.info("Sample list: " + str(self.get_header_sample_list())) 558 log.info("Dataframe: ") 559 for d in str(df).split("\n"): 560 log.info("\t" + str(d)) 561 562 # garbage collector 563 del df 564 gc.collect() 565 566 return None 567 568 def get_stats(self) -> dict: 569 """ 570 The `get_stats` function calculates and returns various statistics of the current object, 571 including information about the input file, variants, samples, header fields, quality, and 572 SNVs/InDels. 573 :return: a dictionary containing various statistics of the current object. The dictionary has 574 the following structure: 575 """ 576 577 # Log 578 log.info(f"Stats Calculation...") 579 580 # table varaints 581 table_variants_from = self.get_table_variants() 582 583 # stats dict 584 stats = {"Infos": {}} 585 586 ### File 587 input_file = self.get_input() 588 stats["Infos"]["Input file"] = input_file 589 590 # Header 591 header_infos = self.get_header().infos 592 header_formats = self.get_header().formats 593 header_infos_list = list(header_infos) 594 header_formats_list = list(header_formats) 595 596 ### Variants 597 598 stats["Variants"] = {} 599 600 # Variants by chr 601 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 602 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 603 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 604 by=["CHROM"], kind="quicksort" 605 ) 606 607 # Total number of variants 608 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 609 610 # Calculate percentage 611 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 612 lambda x: (x / nb_of_variants) 613 ) 614 615 stats["Variants"]["Number of variants by chromosome"] = ( 616 nb_of_variants_by_chrom.to_dict(orient="index") 617 ) 618 619 
stats["Infos"]["Number of variants"] = int(nb_of_variants) 620 621 ### Samples 622 623 # Init 624 samples = {} 625 nb_of_samples = 0 626 627 # Check Samples 628 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 629 log.debug(f"Check samples...") 630 for sample in self.get_header_sample_list(): 631 sql_query_samples = f""" 632 SELECT '{sample}' as sample, 633 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 634 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 635 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 636 FROM {table_variants_from} 637 WHERE ( 638 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 639 AND 640 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 641 ) 642 GROUP BY genotype 643 """ 644 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 645 sample_genotype_count = sql_query_genotype_df["count"].sum() 646 if len(sql_query_genotype_df): 647 nb_of_samples += 1 648 samples[f"{sample} - {sample_genotype_count} variants"] = ( 649 sql_query_genotype_df.to_dict(orient="index") 650 ) 651 652 stats["Samples"] = samples 653 stats["Infos"]["Number of samples"] = nb_of_samples 654 655 # # 656 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 657 # stats["Infos"]["Number of samples"] = nb_of_samples 658 # elif nb_of_samples: 659 # stats["Infos"]["Number of samples"] = "not a VCF format" 660 661 ### INFO and FORMAT fields 662 header_types_df = {} 663 header_types_list = { 664 "List of INFO fields": header_infos, 665 "List of FORMAT fields": header_formats, 666 } 667 i = 0 668 for header_type in header_types_list: 669 670 header_type_infos = header_types_list.get(header_type) 671 header_infos_dict = {} 672 673 for info in header_type_infos: 674 675 i += 1 676 header_infos_dict[i] = {} 677 678 # ID 679 header_infos_dict[i]["id"] = info 680 681 # num 682 genotype_map = 
{None: ".", -1: "A", -2: "G", -3: "R"} 683 if header_type_infos[info].num in genotype_map.keys(): 684 header_infos_dict[i]["Number"] = genotype_map.get( 685 header_type_infos[info].num 686 ) 687 else: 688 header_infos_dict[i]["Number"] = header_type_infos[info].num 689 690 # type 691 if header_type_infos[info].type: 692 header_infos_dict[i]["Type"] = header_type_infos[info].type 693 else: 694 header_infos_dict[i]["Type"] = "." 695 696 # desc 697 if header_type_infos[info].desc != None: 698 header_infos_dict[i]["Description"] = header_type_infos[info].desc 699 else: 700 header_infos_dict[i]["Description"] = "" 701 702 if len(header_infos_dict): 703 header_types_df[header_type] = pd.DataFrame.from_dict( 704 header_infos_dict, orient="index" 705 ).to_dict(orient="index") 706 707 # Stats 708 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 709 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 710 stats["Header"] = header_types_df 711 712 ### QUAL 713 if "QUAL" in self.get_header_columns(): 714 sql_query_qual = f""" 715 SELECT 716 avg(CAST(QUAL AS INTEGER)) AS Average, 717 min(CAST(QUAL AS INTEGER)) AS Minimum, 718 max(CAST(QUAL AS INTEGER)) AS Maximum, 719 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 720 median(CAST(QUAL AS INTEGER)) AS Median, 721 variance(CAST(QUAL AS INTEGER)) AS Variance 722 FROM {table_variants_from} 723 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 724 """ 725 726 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 727 stats["Quality"] = {"Stats": qual} 728 729 ### SNV and InDel 730 731 sql_query_snv = f""" 732 733 SELECT Type, count FROM ( 734 735 SELECT 736 'Total' AS Type, 737 count(*) AS count 738 FROM {table_variants_from} 739 740 UNION 741 742 SELECT 743 'MNV' AS Type, 744 count(*) AS count 745 FROM {table_variants_from} 746 WHERE len(REF) > 1 AND len(ALT) > 1 747 AND len(REF) = len(ALT) 748 749 UNION 750 751 SELECT 752 'InDel' AS Type, 753 count(*) AS count 754 FROM 
{table_variants_from} 755 WHERE len(REF) > 1 OR len(ALT) > 1 756 AND len(REF) != len(ALT) 757 758 UNION 759 760 SELECT 761 'SNV' AS Type, 762 count(*) AS count 763 FROM {table_variants_from} 764 WHERE len(REF) = 1 AND len(ALT) = 1 765 766 ) 767 768 ORDER BY count DESC 769 770 """ 771 snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index") 772 773 sql_query_snv_substitution = f""" 774 SELECT 775 concat(REF, '>', ALT) AS 'Substitution', 776 count(*) AS count 777 FROM {table_variants_from} 778 WHERE len(REF) = 1 AND len(ALT) = 1 779 GROUP BY REF, ALT 780 ORDER BY count(*) DESC 781 """ 782 snv_substitution = ( 783 self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index") 784 ) 785 stats["Variants"]["Counts"] = snv_indel 786 stats["Variants"]["Substitutions"] = snv_substitution 787 788 return stats 789 790 def stats_to_file(self, file: str = None) -> str: 791 """ 792 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 793 into a JSON object, and writes the JSON object to the specified file. 794 795 :param file: The `file` parameter is a string that represents the file path where the JSON data 796 will be written 797 :type file: str 798 :return: the name of the file that was written to. 799 """ 800 801 # Get stats 802 stats = self.get_stats() 803 804 # Serializing json 805 json_object = json.dumps(stats, indent=4) 806 807 # Writing to sample.json 808 with open(file, "w") as outfile: 809 outfile.write(json_object) 810 811 return file 812 813 def print_stats(self, output_file: str = None, json_file: str = None) -> None: 814 """ 815 The `print_stats` function generates a markdown file and prints the statistics contained in a 816 JSON file in a formatted manner. 817 818 :param output_file: The `output_file` parameter is a string that specifies the path and filename 819 of the output file where the stats will be printed in Markdown format. 
If no `output_file` is 820 provided, a temporary directory will be created and the stats will be saved in a file named 821 "stats.md" within that 822 :type output_file: str 823 :param json_file: The `json_file` parameter is a string that represents the path to the JSON 824 file where the statistics will be saved. If no value is provided, a temporary directory will be 825 created and a default file name "stats.json" will be used 826 :type json_file: str 827 :return: The function `print_stats` does not return any value. It has a return type annotation 828 of `None`. 829 """ 830 831 # Full path 832 output_file = full_path(output_file) 833 json_file = full_path(json_file) 834 835 with tempfile.TemporaryDirectory() as tmpdir: 836 837 # Files 838 if not output_file: 839 output_file = os.path.join(tmpdir, "stats.md") 840 if not json_file: 841 json_file = os.path.join(tmpdir, "stats.json") 842 843 # Create folders 844 if not os.path.exists(os.path.dirname(output_file)): 845 Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True) 846 if not os.path.exists(os.path.dirname(json_file)): 847 Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True) 848 849 # Create stats JSON file 850 stats_file = self.stats_to_file(file=json_file) 851 852 # Print stats file 853 with open(stats_file) as f: 854 stats = yaml.safe_load(f) 855 856 # Output 857 output_title = [] 858 output_index = [] 859 output = [] 860 861 # Title 862 output_title.append("# HOWARD Stats") 863 864 # Index 865 output_index.append("## Index") 866 867 # Process sections 868 for section in stats: 869 infos = stats.get(section) 870 section_link = "#" + section.lower().replace(" ", "-") 871 output.append(f"## {section}") 872 output_index.append(f"- [{section}]({section_link})") 873 874 if len(infos): 875 for info in infos: 876 try: 877 df = pd.DataFrame.from_dict(infos.get(info), orient="index") 878 is_df = True 879 except: 880 try: 881 df = pd.DataFrame.from_dict( 882 
json.loads((infos.get(info))), orient="index" 883 ) 884 is_df = True 885 except: 886 is_df = False 887 if is_df: 888 output.append(f"### {info}") 889 info_link = "#" + info.lower().replace(" ", "-") 890 output_index.append(f" - [{info}]({info_link})") 891 output.append(f"{df.to_markdown(index=False)}") 892 else: 893 output.append(f"- {info}: {infos.get(info)}") 894 else: 895 output.append(f"NA") 896 897 # Write stats in markdown file 898 with open(output_file, "w") as fp: 899 for item in output_title: 900 fp.write("%s\n" % item) 901 for item in output_index: 902 fp.write("%s\n" % item) 903 for item in output: 904 fp.write("%s\n" % item) 905 906 # Output stats in markdown 907 print("") 908 print("\n\n".join(output_title)) 909 print("") 910 print("\n\n".join(output)) 911 print("") 912 913 return None 914 915 def get_input(self) -> str: 916 """ 917 It returns the value of the input variable. 918 :return: The input is being returned. 919 """ 920 return self.input 921 922 def get_input_format(self, input_file: str = None) -> str: 923 """ 924 This function returns the format of the input variable, either from the provided input file or 925 by prompting for input. 926 927 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 928 represents the file path of the input file. If no `input_file` is provided when calling the 929 method, it will default to `None` 930 :type input_file: str 931 :return: The format of the input variable is being returned. 932 """ 933 934 if not input_file: 935 input_file = self.get_input() 936 input_format = get_file_format(input_file) 937 return input_format 938 939 def get_input_compressed(self, input_file: str = None) -> str: 940 """ 941 The function `get_input_compressed` returns the format of the input variable after compressing 942 it. 943 944 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 945 that represents the file path of the input file. 
If no `input_file` is provided when calling the 946 method, it will default to `None` and the method will then call `self.get_input()` to 947 :type input_file: str 948 :return: The function `get_input_compressed` returns the compressed format of the input 949 variable. 950 """ 951 952 if not input_file: 953 input_file = self.get_input() 954 input_compressed = get_file_compressed(input_file) 955 return input_compressed 956 957 def get_output(self) -> str: 958 """ 959 It returns the output of the neuron. 960 :return: The output of the neural network. 961 """ 962 963 return self.output 964 965 def get_output_format(self, output_file: str = None) -> str: 966 """ 967 The function `get_output_format` returns the format of the input variable or the output file if 968 provided. 969 970 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 971 that represents the file path of the output file. If no `output_file` is provided when calling 972 the method, it will default to the output obtained from the `get_output` method of the class 973 instance. The 974 :type output_file: str 975 :return: The format of the input variable is being returned. 976 """ 977 978 if not output_file: 979 output_file = self.get_output() 980 output_format = get_file_format(output_file) 981 982 return output_format 983 984 def get_config(self) -> dict: 985 """ 986 It returns the config 987 :return: The config variable is being returned. 988 """ 989 return self.config 990 991 def get_param(self) -> dict: 992 """ 993 It returns the param 994 :return: The param variable is being returned. 995 """ 996 return self.param 997 998 def get_connexion_db(self) -> str: 999 """ 1000 It returns the connexion_db attribute of the object 1001 :return: The connexion_db is being returned. 1002 """ 1003 return self.connexion_db 1004 1005 def get_prefix(self) -> str: 1006 """ 1007 It returns the prefix of the object. 1008 :return: The prefix is being returned. 
1009 """ 1010 return self.prefix 1011 1012 def get_table_variants(self, clause: str = "select") -> str: 1013 """ 1014 This function returns the table_variants attribute of the object 1015 1016 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 1017 defaults to select (optional) 1018 :return: The table_variants attribute of the object. 1019 """ 1020 1021 # Access 1022 access = self.get_config().get("access", None) 1023 1024 # Clauses "select", "where", "update" 1025 if clause in ["select", "where", "update"]: 1026 table_variants = self.table_variants 1027 # Clause "from" 1028 elif clause in ["from"]: 1029 # For Read Only 1030 if self.get_input_format() in ["parquet"] and access in ["RO"]: 1031 input_file = self.get_input() 1032 table_variants = f"'{input_file}' as variants" 1033 # For Read Write 1034 else: 1035 table_variants = f"{self.table_variants} as variants" 1036 else: 1037 table_variants = self.table_variants 1038 return table_variants 1039 1040 def get_tmp_dir(self) -> str: 1041 """ 1042 The function `get_tmp_dir` returns the temporary directory path based on configuration 1043 parameters or a default path. 1044 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1045 configuration, parameters, and a default value of "/tmp". 1046 """ 1047 1048 return get_tmp( 1049 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1050 ) 1051 1052 def get_connexion_type(self) -> str: 1053 """ 1054 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1055 1056 :return: The connexion type is being returned. 1057 """ 1058 return self.get_config().get("connexion_type", "memory") 1059 1060 def get_connexion(self): 1061 """ 1062 It returns the connection object 1063 1064 :return: The connection object. 1065 """ 1066 return self.conn 1067 1068 def close_connexion(self) -> None: 1069 """ 1070 This function closes the connection to the database. 
1071 :return: The connection is being closed. 1072 """ 1073 return self.conn.close() 1074 1075 def get_header(self, type: str = "vcf"): 1076 """ 1077 This function returns the header of the VCF file as a list of strings 1078 1079 :param type: the type of header you want to get, defaults to vcf (optional) 1080 :return: The header of the vcf file. 1081 """ 1082 1083 if self.header_vcf: 1084 if type == "vcf": 1085 return self.header_vcf 1086 elif type == "list": 1087 return self.header_list 1088 else: 1089 if type == "vcf": 1090 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1091 return header 1092 elif type == "list": 1093 return vcf_required 1094 1095 def get_header_length(self, file: str = None) -> int: 1096 """ 1097 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1098 line. 1099 1100 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1101 header file. If this argument is provided, the function will read the header from the specified 1102 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1103 :type file: str 1104 :return: the length of the header list, excluding the #CHROM line. 1105 """ 1106 1107 if file: 1108 return len(self.read_vcf_header_file(file=file)) - 1 1109 elif self.get_header(type="list"): 1110 return len(self.get_header(type="list")) - 1 1111 else: 1112 return 0 1113 1114 def get_header_columns(self) -> str: 1115 """ 1116 This function returns the header list of a VCF 1117 1118 :return: The length of the header list. 1119 """ 1120 if self.get_header(): 1121 return self.get_header(type="list")[-1] 1122 else: 1123 return "" 1124 1125 def get_header_columns_as_list(self) -> list: 1126 """ 1127 This function returns the header list of a VCF 1128 1129 :return: The length of the header list. 
1130 """ 1131 if self.get_header(): 1132 return self.get_header_columns().strip().split("\t") 1133 else: 1134 return [] 1135 1136 def get_header_columns_as_sql(self) -> str: 1137 """ 1138 This function retruns header length (without #CHROM line) 1139 1140 :return: The length of the header list. 1141 """ 1142 sql_column_list = [] 1143 for col in self.get_header_columns_as_list(): 1144 sql_column_list.append(f'"{col}"') 1145 return ",".join(sql_column_list) 1146 1147 def get_header_sample_list( 1148 self, check: bool = False, samples: list = None, samples_force: bool = False 1149 ) -> list: 1150 """ 1151 The function `get_header_sample_list` returns a list of samples from a VCF header, with optional 1152 checking and filtering based on input parameters. 1153 1154 :param check: The `check` parameter in the `get_header_sample_list` function is a boolean 1155 parameter that determines whether to check if the samples in the list are properly defined as 1156 genotype columns. If `check` is set to `True`, the function will verify if each sample in the 1157 list is defined as a, defaults to False 1158 :type check: bool (optional) 1159 :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that 1160 allows you to specify a subset of samples from the header. If you provide a list of sample 1161 names, the function will check if each sample is defined in the header. If a sample is not found 1162 in the 1163 :type samples: list 1164 :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is 1165 a boolean parameter that determines whether to force the function to return the sample list 1166 without checking if the samples are genotype columns. 
If `samples_force` is set to `True`, the 1167 function will return the sample list without performing, defaults to False 1168 :type samples_force: bool (optional) 1169 :return: The function `get_header_sample_list` returns a list of samples based on the input 1170 parameters and conditions specified in the function. 1171 """ 1172 1173 # Init 1174 samples_list = [] 1175 1176 if samples is None: 1177 samples_list = self.header_vcf.samples 1178 else: 1179 samples_checked = [] 1180 for sample in samples: 1181 if sample in self.header_vcf.samples: 1182 samples_checked.append(sample) 1183 else: 1184 log.warning(f"Sample '{sample}' not defined in header") 1185 samples_list = samples_checked 1186 1187 # Force sample list without checking if is_genotype_column 1188 if samples_force: 1189 log.warning(f"Samples {samples_list} not checked if genotypes") 1190 return samples_list 1191 1192 if check: 1193 samples_checked = [] 1194 for sample in samples_list: 1195 if self.is_genotype_column(column=sample): 1196 samples_checked.append(sample) 1197 else: 1198 log.warning( 1199 f"Sample '{sample}' not defined as a sample (genotype not well defined)" 1200 ) 1201 samples_list = samples_checked 1202 1203 # Return samples list 1204 return samples_list 1205 1206 def is_genotype_column(self, column: str = None) -> bool: 1207 """ 1208 This function checks if a given column is a genotype column in a database. 1209 1210 :param column: The `column` parameter in the `is_genotype_column` method is a string that 1211 represents the column name in a database table. This method checks if the specified column is a 1212 genotype column in the database. If a column name is provided, it calls the `is_genotype_column` 1213 method of 1214 :type column: str 1215 :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter 1216 is not None, it calls the `is_genotype_column` method of the `Database` class with the specified 1217 column name and returns the result. 
If the `column` parameter is None, it returns False. 1218 """ 1219 1220 if column is not None: 1221 return Database(database=self.get_input()).is_genotype_column(column=column) 1222 else: 1223 return False 1224 1225 def get_verbose(self) -> bool: 1226 """ 1227 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1228 exist 1229 1230 :return: The value of the key "verbose" in the config dictionary. 1231 """ 1232 return self.get_config().get("verbose", False) 1233 1234 def get_connexion_format(self) -> str: 1235 """ 1236 It returns the connexion format of the object. 1237 :return: The connexion_format is being returned. 1238 """ 1239 connexion_format = self.connexion_format 1240 if connexion_format not in ["duckdb", "sqlite"]: 1241 log.error(f"Unknown connexion format {connexion_format}") 1242 raise ValueError(f"Unknown connexion format {connexion_format}") 1243 else: 1244 return connexion_format 1245 1246 def insert_file_to_table( 1247 self, 1248 file, 1249 columns: str, 1250 header_len: int = 0, 1251 sep: str = "\t", 1252 chunksize: int = 1000000, 1253 ) -> None: 1254 """ 1255 The function reads a file in chunks and inserts each chunk into a table based on the specified 1256 database format. 1257 1258 :param file: The `file` parameter is the file that you want to load into a table. It should be 1259 the path to the file on your system 1260 :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that 1261 should contain the names of the columns in the table where the data will be inserted. The column 1262 names should be separated by commas within the string. For example, if you have columns named 1263 "id", "name 1264 :type columns: str 1265 :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies 1266 the number of lines to skip at the beginning of the file before reading the actual data. 
This 1267 parameter allows you to skip any header information present in the file before processing the 1268 data, defaults to 0 1269 :type header_len: int (optional) 1270 :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the 1271 separator character that is used in the file being read. In this case, the default separator is 1272 set to `\t`, which represents a tab character. You can change this parameter to a different 1273 separator character if, defaults to \t 1274 :type sep: str (optional) 1275 :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time 1276 when processing the file in chunks. In the provided code snippet, the default value for 1277 `chunksize` is set to 1000000. This means that the file will be read in chunks of 1,, defaults 1278 to 1000000 1279 :type chunksize: int (optional) 1280 """ 1281 1282 # Config 1283 chunksize = self.get_config().get("load", {}).get("chunk", chunksize) 1284 connexion_format = self.get_connexion_format() 1285 1286 log.debug("chunksize: " + str(chunksize)) 1287 1288 if chunksize: 1289 for chunk in pd.read_csv( 1290 file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c" 1291 ): 1292 if connexion_format in ["duckdb"]: 1293 sql_insert_into = ( 1294 f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk" 1295 ) 1296 self.conn.execute(sql_insert_into) 1297 elif connexion_format in ["sqlite"]: 1298 chunk.to_sql("variants", self.conn, if_exists="append", index=False) 1299 1300 def load_data( 1301 self, 1302 input_file: str = None, 1303 drop_variants_table: bool = False, 1304 sample_size: int = 20480, 1305 ) -> None: 1306 """ 1307 The `load_data` function reads a VCF file and inserts it into a table, with options to drop the 1308 table before loading the data and specify a sample size. 1309 1310 :param input_file: The path to the input file. 
This is the VCF file that will be loaded into the 1311 table 1312 :type input_file: str 1313 :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that 1314 determines whether the variants table should be dropped before loading the data. If set to 1315 `True`, the variants table will be dropped. If set to `False` (default), the variants table will 1316 not be dropped, defaults to False 1317 :type drop_variants_table: bool (optional) 1318 :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from 1319 the input file. If it is set to `None`, the default value of 20480 will be used, defaults to 1320 20480 1321 :type sample_size: int (optional) 1322 """ 1323 1324 log.info("Loading...") 1325 1326 # change input file 1327 if input_file: 1328 self.set_input(input_file) 1329 self.set_header() 1330 1331 # drop variants table 1332 if drop_variants_table: 1333 self.drop_variants_table() 1334 1335 # get table variants 1336 table_variants = self.get_table_variants() 1337 1338 # Access 1339 access = self.get_config().get("access", None) 1340 log.debug(f"access: {access}") 1341 1342 # Input format and compress 1343 input_format = self.get_input_format() 1344 input_compressed = self.get_input_compressed() 1345 log.debug(f"input_format: {input_format}") 1346 log.debug(f"input_compressed: {input_compressed}") 1347 1348 # input_compressed_format 1349 if input_compressed: 1350 input_compressed_format = "gzip" 1351 else: 1352 input_compressed_format = "none" 1353 log.debug(f"input_compressed_format: {input_compressed_format}") 1354 1355 # Connexion format 1356 connexion_format = self.get_connexion_format() 1357 1358 # Sample size 1359 if not sample_size: 1360 sample_size = -1 1361 log.debug(f"sample_size: {sample_size}") 1362 1363 # Load data 1364 log.debug(f"Load Data from {input_format}") 1365 1366 # DuckDB connexion 1367 if connexion_format in ["duckdb"]: 1368 1369 # Database already exists 1370 if self.input_format 
in ["db", "duckdb"]: 1371 1372 if connexion_format in ["duckdb"]: 1373 log.debug(f"Input file format '{self.input_format}' duckDB") 1374 else: 1375 log.error( 1376 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1377 ) 1378 raise ValueError( 1379 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1380 ) 1381 1382 # Load from existing database format 1383 else: 1384 1385 try: 1386 # Create Table or View 1387 database = Database(database=self.input) 1388 sql_from = database.get_sql_from(sample_size=sample_size) 1389 1390 if access in ["RO"]: 1391 sql_load = ( 1392 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1393 ) 1394 else: 1395 sql_load = ( 1396 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1397 ) 1398 self.conn.execute(sql_load) 1399 1400 except: 1401 # Format not available 1402 log.error(f"Input file format '{self.input_format}' not available") 1403 raise ValueError( 1404 f"Input file format '{self.input_format}' not available" 1405 ) 1406 1407 # SQLite connexion 1408 elif connexion_format in ["sqlite"] and input_format in [ 1409 "vcf", 1410 "tsv", 1411 "csv", 1412 "psv", 1413 ]: 1414 1415 # Main structure 1416 structure = { 1417 "#CHROM": "VARCHAR", 1418 "POS": "INTEGER", 1419 "ID": "VARCHAR", 1420 "REF": "VARCHAR", 1421 "ALT": "VARCHAR", 1422 "QUAL": "VARCHAR", 1423 "FILTER": "VARCHAR", 1424 "INFO": "VARCHAR", 1425 } 1426 1427 # Strcuture with samples 1428 structure_complete = structure 1429 if self.get_header_sample_list(): 1430 structure["FORMAT"] = "VARCHAR" 1431 for sample in self.get_header_sample_list(): 1432 structure_complete[sample] = "VARCHAR" 1433 1434 # Columns list for create and insert 1435 sql_create_table_columns = [] 1436 sql_create_table_columns_list = [] 1437 for column in structure_complete: 1438 column_type = structure_complete[column] 1439 sql_create_table_columns.append( 1440 f'"{column}" {column_type} 
default NULL' 1441 ) 1442 sql_create_table_columns_list.append(f'"{column}"') 1443 1444 # Create database 1445 log.debug(f"Create Table {table_variants}") 1446 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1447 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1448 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1449 self.conn.execute(sql_create_table) 1450 1451 # chunksize define length of file chunk load file 1452 chunksize = 100000 1453 1454 # delimiter 1455 delimiter = file_format_delimiters.get(input_format, "\t") 1456 1457 # Load the input file 1458 with open(self.input, "rt") as input_file: 1459 1460 # Use the appropriate file handler based on the input format 1461 if input_compressed: 1462 input_file = bgzf.open(self.input, "rt") 1463 if input_format in ["vcf"]: 1464 header_len = self.get_header_length() 1465 else: 1466 header_len = 0 1467 1468 # Insert the file contents into a table 1469 self.insert_file_to_table( 1470 input_file, 1471 columns=sql_create_table_columns_list_sql, 1472 header_len=header_len, 1473 sep=delimiter, 1474 chunksize=chunksize, 1475 ) 1476 1477 else: 1478 log.error( 1479 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1480 ) 1481 raise ValueError( 1482 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1483 ) 1484 1485 # Explode INFOS fields into table fields 1486 if self.get_explode_infos(): 1487 self.explode_infos( 1488 prefix=self.get_explode_infos_prefix(), 1489 fields=self.get_explode_infos_fields(), 1490 force=True, 1491 ) 1492 1493 # Create index after insertion 1494 self.create_indexes() 1495 1496 def get_explode_infos(self) -> bool: 1497 """ 1498 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1499 to False if it is not set. 
1500 :return: The method is returning the value of the "explode_infos" parameter, which is a boolean 1501 value. If the parameter is not present, it will return False. 1502 """ 1503 1504 return self.get_param().get("explode", {}).get("explode_infos", False) 1505 1506 def get_explode_infos_fields( 1507 self, 1508 explode_infos_fields: str = None, 1509 remove_fields_not_in_header: bool = False, 1510 ) -> list: 1511 """ 1512 The `get_explode_infos_fields` function returns a list of exploded information fields based on 1513 the input parameter `explode_infos_fields`. 1514 1515 :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the 1516 fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a 1517 comma-separated list of field names to explode 1518 :type explode_infos_fields: str 1519 :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean 1520 flag that determines whether to remove fields that are not present in the header. If it is set 1521 to `True`, any field that is not in the header will be excluded from the list of exploded 1522 information fields. If it is set to `, defaults to False 1523 :type remove_fields_not_in_header: bool (optional) 1524 :return: The function `get_explode_infos_fields` returns a list of exploded information fields. 1525 If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty 1526 list. If the parameter is provided and its value is "ALL", it also returns an empty list. 1527 Otherwise, it returns a list of exploded information fields after removing any spaces and 1528 splitting the string by commas. 
1529 """ 1530 1531 # If no fields, get it in param 1532 if not explode_infos_fields: 1533 explode_infos_fields = ( 1534 self.get_param().get("explode", {}).get("explode_infos_fields", None) 1535 ) 1536 1537 # If no fields, defined as all fields in header using keyword 1538 if not explode_infos_fields: 1539 explode_infos_fields = "*" 1540 1541 # If fields list not empty 1542 if explode_infos_fields: 1543 1544 # Input fields list 1545 if isinstance(explode_infos_fields, str): 1546 fields_input = explode_infos_fields.split(",") 1547 elif isinstance(explode_infos_fields, list): 1548 fields_input = explode_infos_fields 1549 else: 1550 fields_input = [] 1551 1552 # Fields list without * keyword 1553 fields_without_all = fields_input.copy() 1554 if "*".casefold() in (item.casefold() for item in fields_without_all): 1555 fields_without_all.remove("*") 1556 1557 # Fields in header 1558 fields_in_header = sorted(list(set(self.get_header().infos))) 1559 1560 # Construct list of fields 1561 fields_output = [] 1562 for field in fields_input: 1563 1564 # Strip field 1565 field = field.strip() 1566 1567 # format keyword * in regex 1568 if field.upper() in ["*"]: 1569 field = ".*" 1570 1571 # Find all fields with pattern 1572 r = re.compile(field) 1573 fields_search = sorted(list(filter(r.match, fields_in_header))) 1574 1575 # Remove fields input from search 1576 if field in fields_search: 1577 fields_search = [field] 1578 elif fields_search != [field]: 1579 fields_search = sorted( 1580 list(set(fields_search).difference(fields_input)) 1581 ) 1582 1583 # If field is not in header (avoid not well formatted header) 1584 if not fields_search and not remove_fields_not_in_header: 1585 fields_search = [field] 1586 1587 # Add found fields 1588 for new_field in fields_search: 1589 # Add field, if not already exists, and if it is in header (if asked) 1590 if ( 1591 new_field not in fields_output 1592 and ( 1593 not remove_fields_not_in_header 1594 or new_field in fields_in_header 1595 ) 
1596 and new_field not in [".*"] 1597 ): 1598 fields_output.append(new_field) 1599 1600 return fields_output 1601 1602 else: 1603 1604 return [] 1605 1606 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1607 """ 1608 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1609 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1610 not provided. 1611 1612 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1613 prefix to be used for exploding or expanding information 1614 :type explode_infos_prefix: str 1615 :return: the value of the variable `explode_infos_prefix`. 1616 """ 1617 1618 if not explode_infos_prefix: 1619 explode_infos_prefix = ( 1620 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1621 ) 1622 1623 return explode_infos_prefix 1624 1625 def add_column( 1626 self, 1627 table_name, 1628 column_name, 1629 column_type, 1630 default_value=None, 1631 drop: bool = False, 1632 ) -> dict: 1633 """ 1634 The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it 1635 doesn't already exist. 1636 1637 :param table_name: The name of the table to which you want to add a column 1638 :param column_name: The parameter "column_name" is the name of the column that you want to add 1639 to the table 1640 :param column_type: The `column_type` parameter specifies the data type of the column that you 1641 want to add to the table. It should be a string that represents the desired data type, such as 1642 "INTEGER", "TEXT", "REAL", etc 1643 :param default_value: The `default_value` parameter is an optional parameter that specifies the 1644 default value for the newly added column. 
If a default value is provided, it will be assigned to 1645 the column for any existing rows that do not have a value for that column 1646 :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column 1647 if it already exists in the table. If `drop` is set to `True`, the function will drop the 1648 existing column before adding the new column. If `drop` is set to `False` (default),, defaults 1649 to False 1650 :type drop: bool (optional) 1651 :return: a boolean value indicating whether the column was successfully added to the table. 1652 """ 1653 1654 # added 1655 added = False 1656 dropped = False 1657 1658 # Check if the column already exists in the table 1659 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1660 columns = self.get_query_to_df(query).columns.tolist() 1661 if column_name.upper() in [c.upper() for c in columns]: 1662 log.debug( 1663 f"The {column_name} column already exists in the {table_name} table" 1664 ) 1665 if drop: 1666 self.drop_column(table_name=table_name, column_name=column_name) 1667 dropped = True 1668 else: 1669 return None 1670 else: 1671 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1672 1673 # Add column in table 1674 add_column_query = ( 1675 f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """ 1676 ) 1677 if default_value is not None: 1678 add_column_query += f" DEFAULT {default_value}" 1679 self.execute_query(add_column_query) 1680 added = not dropped 1681 log.debug( 1682 f"The {column_name} column was successfully added to the {table_name} table" 1683 ) 1684 1685 if added: 1686 added_column = { 1687 "table_name": table_name, 1688 "column_name": column_name, 1689 "column_type": column_type, 1690 "default_value": default_value, 1691 } 1692 else: 1693 added_column = None 1694 1695 return added_column 1696 1697 def drop_column( 1698 self, column: dict = None, table_name: str = None, column_name: str = None 1699 ) -> bool: 1700 """ 1701 The 
`drop_column` function drops a specified column from a given table in a database and returns 1702 True if the column was successfully dropped, and False if the column does not exist in the 1703 table. 1704 1705 :param column: The `column` parameter is a dictionary that contains information about the column 1706 you want to drop. It has two keys: 1707 :type column: dict 1708 :param table_name: The `table_name` parameter is the name of the table from which you want to 1709 drop a column 1710 :type table_name: str 1711 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1712 from the table 1713 :type column_name: str 1714 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1715 and False if the column does not exist in the table. 1716 """ 1717 1718 # Find column infos 1719 if column: 1720 if isinstance(column, dict): 1721 table_name = column.get("table_name", None) 1722 column_name = column.get("column_name", None) 1723 elif isinstance(column, str): 1724 table_name = self.get_table_variants() 1725 column_name = column 1726 else: 1727 table_name = None 1728 column_name = None 1729 1730 if not table_name and not column_name: 1731 return False 1732 1733 # Removed 1734 removed = False 1735 1736 # Check if the column already exists in the table 1737 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1738 columns = self.get_query_to_df(query).columns.tolist() 1739 if column_name in columns: 1740 log.debug(f"The {column_name} column exists in the {table_name} table") 1741 else: 1742 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1743 return False 1744 1745 # Add column in table # ALTER TABLE integers DROP k 1746 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1747 self.execute_query(add_column_query) 1748 removed = True 1749 log.debug( 1750 f"The {column_name} column was successfully dropped to the {table_name} table" 1751 ) 1752 
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        Explode VCF INFO fields into individual table columns.

        For each requested INFO field, a column is added to the variants table
        (named with `prefix`) and filled by parsing the raw INFO string with
        backend-specific SQL (DuckDB regex, or SQLite instr/substr).

        :param prefix: Prefix for the exploded columns; defaults to the
            configured explode prefix, and finally to "INFO/"
        :param create_index: If True, (re)create indexes after exploding
        :param fields: INFO fields (or patterns) to explode; defaults to the
            configured fields, resolved by `get_explode_infos_fields`
        :param force: If True, drop and re-create columns that already exist
        :param proccess_all_fields_together: If True, run one UPDATE setting
            all fields at once; otherwise one UPDATE per field
        :param table: Target table; defaults to the variants table
        :return: The list of added columns (dicts from `add_column`).
        """

        # Indexes must not exist while the table is massively updated
        self.drop_indexes()

        # connexion format (determines the SQL dialect used below)
        connexion_format = self.get_connexion_format()

        # Access mode: read-only databases cannot be altered
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix (NOTE: True is treated like "unset" — presumably a
            # legacy flag value; confirm before simplifying)
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos (columns present in the table but not in the header);
            # best-effort: failures simply mean no extra columns
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check: header fields plus explicitly requested ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns (regex / "*" keyword resolution)
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                # Only explode fields known from the header, the request,
                # or the extra (non-header) table columns
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields are always stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    # With force, the column may have been dropped/recreated
                    # (add_column returns None in that case) but still needs filling
                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array: SET clause extracting the value
                        # of `info` from the raw INFO string ('' and '.' -> NULL)
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                            CASE
                                WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                            END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                            CASE
                                WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                            END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # Process chromosome by chromosome to bound UPDATE size;
                # fall back to a single pass when #CHROM cannot be listed
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        # One UPDATE setting every exploded field at once
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter_table_array_join}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        # One UPDATE per exploded field
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns
1962 ) 1963 # log.debug(sql_info_alter_table) 1964 self.conn.execute(sql_info_alter_table) 1965 1966 # create indexes 1967 if create_index: 1968 self.create_indexes() 1969 1970 return added_columns 1971 1972 def create_indexes(self) -> None: 1973 """ 1974 Create indexes on the table after insertion 1975 """ 1976 1977 # Access 1978 access = self.get_config().get("access", None) 1979 1980 # get table variants 1981 table_variants = self.get_table_variants("FROM") 1982 1983 if self.get_indexing() and access not in ["RO"]: 1984 # Create index 1985 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")' 1986 self.conn.execute(sql_create_table_index) 1987 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")' 1988 self.conn.execute(sql_create_table_index) 1989 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")' 1990 self.conn.execute(sql_create_table_index) 1991 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 1992 self.conn.execute(sql_create_table_index) 1993 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 1994 self.conn.execute(sql_create_table_index) 1995 for field in self.index_additionnal_fields: 1996 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 1997 self.conn.execute(sql_create_table_index) 1998 1999 def drop_indexes(self) -> None: 2000 """ 2001 Create indexes on the table after insertion 2002 """ 2003 2004 # Access 2005 access = self.get_config().get("access", None) 2006 2007 # get table variants 2008 table_variants = self.get_table_variants("FROM") 2009 2010 # Get database format 2011 connexion_format = 
self.get_connexion_format() 2012 2013 if access not in ["RO"]: 2014 if connexion_format in ["duckdb"]: 2015 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 2016 elif connexion_format in ["sqlite"]: 2017 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 2018 2019 list_indexes = self.conn.execute(sql_list_indexes) 2020 index_names = [row[0] for row in list_indexes.fetchall()] 2021 for index in index_names: 2022 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 2023 self.conn.execute(sql_drop_table_index) 2024 2025 def read_vcf_header(self, f) -> list: 2026 """ 2027 It reads the header of a VCF file and returns a list of the header lines 2028 2029 :param f: the file object 2030 :return: The header lines of the VCF file. 2031 """ 2032 2033 header_list = [] 2034 for line in f: 2035 header_list.append(line) 2036 if line.startswith("#CHROM"): 2037 break 2038 return header_list 2039 2040 def read_vcf_header_file(self, file: str = None) -> list: 2041 """ 2042 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 2043 uncompressed files. 2044 2045 :param file: The `file` parameter is a string that represents the path to the VCF header file 2046 that you want to read. It is an optional parameter, so if you don't provide a value, it will 2047 default to `None` 2048 :type file: str 2049 :return: The function `read_vcf_header_file` returns a list. 2050 """ 2051 2052 if self.get_input_compressed(input_file=file): 2053 with bgzf.open(file, "rt") as f: 2054 return self.read_vcf_header(f=f) 2055 else: 2056 with open(file, "rt") as f: 2057 return self.read_vcf_header(f=f) 2058 2059 def execute_query(self, query: str): 2060 """ 2061 It takes a query as an argument, executes it, and returns the results 2062 2063 :param query: The query to be executed 2064 :return: The result of the query is being returned. 
2065 """ 2066 if query: 2067 return self.conn.execute(query) # .fetchall() 2068 else: 2069 return None 2070 2071 def export_output( 2072 self, 2073 output_file: str | None = None, 2074 output_header: str | None = None, 2075 export_header: bool = True, 2076 query: str | None = None, 2077 parquet_partitions: list | None = None, 2078 chunk_size: int | None = None, 2079 threads: int | None = None, 2080 sort: bool = False, 2081 index: bool = False, 2082 order_by: str | None = None, 2083 ) -> bool: 2084 """ 2085 The `export_output` function exports data from a VCF file to a specified output file in various 2086 formats, including VCF, CSV, TSV, PSV, and Parquet. 2087 2088 :param output_file: The `output_file` parameter is a string that specifies the name of the 2089 output file to be generated by the function. This is where the exported data will be saved 2090 :type output_file: str 2091 :param output_header: The `output_header` parameter is a string that specifies the name of the 2092 file where the header of the VCF file will be exported. If this parameter is not provided, the 2093 header will be exported to a file with the same name as the `output_file` parameter, but with 2094 the extension " 2095 :type output_header: str 2096 :param export_header: The `export_header` parameter is a boolean flag that determines whether 2097 the header of a VCF file should be exported to a separate file or not. If `export_header` is 2098 True, the header will be exported to a file. If `export_header` is False, the header will not 2099 be, defaults to True, if output format is not VCF 2100 :type export_header: bool (optional) 2101 :param query: The `query` parameter is an optional SQL query that can be used to filter and 2102 select specific data from the VCF file before exporting it. 
If provided, only the data that 2103 matches the query will be exported 2104 :type query: str 2105 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 2106 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 2107 organize data in a hierarchical directory structure based on the values of one or more columns. 2108 This can improve query performance when working with large datasets 2109 :type parquet_partitions: list 2110 :param chunk_size: The `chunk_size` parameter specifies the number of 2111 records in batch when exporting data in Parquet format. This parameter is used for 2112 partitioning the Parquet file into multiple files. 2113 :type chunk_size: int 2114 :param threads: The `threads` parameter is an optional parameter that specifies the number of 2115 threads to be used during the export process. It determines the level of parallelism and can 2116 improve the performance of the export operation. If not provided, the function will use the 2117 default number of threads 2118 :type threads: int 2119 :param sort: The `sort` parameter is a boolean flag that determines whether the output file 2120 should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the 2121 genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to 2122 False 2123 :type sort: bool (optional) 2124 :param index: The `index` parameter is a boolean flag that determines whether an index should be 2125 created on the output file. If `index` is True, an index will be created. If `index` is False, 2126 no index will be created. The default value is False, defaults to False 2127 :type index: bool (optional) 2128 :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for 2129 sorting the output file. This parameter is only applicable when exporting data in VCF format 2130 :type order_by: str 2131 :return: a boolean value. 
It checks if the output file exists and returns True if it does, or 2132 None if it doesn't. 2133 """ 2134 2135 # Log 2136 log.info("Exporting...") 2137 2138 # Full path 2139 output_file = full_path(output_file) 2140 output_header = full_path(output_header) 2141 2142 # Config 2143 config = self.get_config() 2144 2145 # Param 2146 param = self.get_param() 2147 2148 # Tmp files to remove 2149 tmp_to_remove = [] 2150 2151 # If no output, get it 2152 if not output_file: 2153 output_file = self.get_output() 2154 2155 # If not threads 2156 if not threads: 2157 threads = self.get_threads() 2158 2159 # Auto header name with extension 2160 if export_header or output_header: 2161 if not output_header: 2162 output_header = f"{output_file}.hdr" 2163 # Export header 2164 self.export_header(output_file=output_file) 2165 2166 # Switch off export header if VCF output 2167 output_file_type = get_file_format(output_file) 2168 if output_file_type in ["vcf"]: 2169 export_header = False 2170 tmp_to_remove.append(output_header) 2171 2172 # Chunk size 2173 if not chunk_size: 2174 chunk_size = config.get("chunk_size", None) 2175 2176 # Parquet partition 2177 if not parquet_partitions: 2178 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2179 if parquet_partitions and isinstance(parquet_partitions, str): 2180 parquet_partitions = parquet_partitions.split(",") 2181 2182 # Order by 2183 if not order_by: 2184 order_by = param.get("export", {}).get("order_by", "") 2185 2186 # Header in output 2187 header_in_output = param.get("export", {}).get("include_header", False) 2188 2189 # Database 2190 database_source = self.get_connexion() 2191 2192 # Connexion format 2193 connexion_format = self.get_connexion_format() 2194 2195 # Explode infos 2196 if self.get_explode_infos(): 2197 self.explode_infos( 2198 prefix=self.get_explode_infos_prefix(), 2199 fields=self.get_explode_infos_fields(), 2200 force=False, 2201 ) 2202 2203 # if connexion_format in ["sqlite"] or query: 
2204 if connexion_format in ["sqlite"]: 2205 2206 # Export in Parquet 2207 random_tmp = "".join( 2208 random.choice(string.ascii_lowercase) for i in range(10) 2209 ) 2210 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2211 tmp_to_remove.append(database_source) 2212 2213 # Table Variants 2214 table_variants = self.get_table_variants() 2215 2216 # Create export query 2217 sql_query_export_subquery = f""" 2218 SELECT * FROM {table_variants} 2219 """ 2220 2221 # Write source file 2222 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2223 2224 # Create database 2225 database = Database( 2226 database=database_source, 2227 table="variants", 2228 header_file=output_header, 2229 conn_config=self.get_connexion_config(), 2230 ) 2231 2232 # Existing colomns header 2233 existing_columns_header = database.get_header_columns_from_database() 2234 2235 # Sample list 2236 get_samples = self.get_samples() 2237 get_samples_check = self.get_samples_check() 2238 samples_force = get_samples is not None 2239 sample_list = self.get_header_sample_list( 2240 check=get_samples_check, samples=get_samples, samples_force=samples_force 2241 ) 2242 2243 # Export file 2244 database.export( 2245 output_database=output_file, 2246 output_header=output_header, 2247 existing_columns_header=existing_columns_header, 2248 parquet_partitions=parquet_partitions, 2249 chunk_size=chunk_size, 2250 threads=threads, 2251 sort=sort, 2252 index=index, 2253 header_in_output=header_in_output, 2254 order_by=order_by, 2255 query=query, 2256 export_header=export_header, 2257 sample_list=sample_list, 2258 ) 2259 2260 # Remove 2261 remove_if_exists(tmp_to_remove) 2262 2263 return (os.path.exists(output_file) or None) and ( 2264 os.path.exists(output_file) or None 2265 ) 2266 2267 def get_extra_infos(self, table: str = None) -> list: 2268 """ 2269 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2270 in the header. 
2271 2272 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2273 name of the table from which you want to retrieve the extra columns that are not present in the 2274 header. If the `table` parameter is not provided when calling the function, it will default to 2275 using the variants 2276 :type table: str 2277 :return: A list of columns that are in the specified table but not in the header of the table. 2278 """ 2279 2280 header_columns = [] 2281 2282 if not table: 2283 table = self.get_table_variants(clause="from") 2284 header_columns = self.get_header_columns() 2285 2286 # Check all columns in the database 2287 query = f""" SELECT * FROM {table} LIMIT 1 """ 2288 log.debug(f"query {query}") 2289 table_columns = self.get_query_to_df(query).columns.tolist() 2290 extra_columns = [] 2291 2292 # Construct extra infos (not in header) 2293 for column in table_columns: 2294 if column not in header_columns: 2295 extra_columns.append(column) 2296 2297 return extra_columns 2298 2299 def get_extra_infos_sql(self, table: str = None) -> str: 2300 """ 2301 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2302 by double quotes 2303 2304 :param table: The name of the table to get the extra infos from. If None, the default table is 2305 used 2306 :type table: str 2307 :return: A string of the extra infos 2308 """ 2309 2310 return ", ".join( 2311 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2312 ) 2313 2314 def export_header( 2315 self, 2316 header_name: str = None, 2317 output_file: str = None, 2318 output_file_ext: str = ".hdr", 2319 clean_header: bool = True, 2320 remove_chrom_line: bool = False, 2321 ) -> str: 2322 """ 2323 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2324 specified options, and writes it to a new file. 
2325 2326 :param header_name: The `header_name` parameter is the name of the header file to be created. If 2327 this parameter is not specified, the header will be written to the output file 2328 :type header_name: str 2329 :param output_file: The `output_file` parameter in the `export_header` function is used to 2330 specify the name of the output file where the header will be written. If this parameter is not 2331 provided, the header will be written to a temporary file 2332 :type output_file: str 2333 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2334 string that represents the extension of the output header file. By default, it is set to ".hdr" 2335 if not specified by the user. This extension will be appended to the `output_file` name to 2336 create the final, defaults to .hdr 2337 :type output_file_ext: str (optional) 2338 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2339 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2340 `True`, the function will clean the header by modifying certain lines based on a specific 2341 pattern. If `clean_header`, defaults to True 2342 :type clean_header: bool (optional) 2343 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2344 boolean flag that determines whether the #CHROM line should be removed from the header before 2345 writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `, 2346 defaults to False 2347 :type remove_chrom_line: bool (optional) 2348 :return: The function `export_header` returns the name of the temporary header file that is 2349 created. 
2350 """ 2351 2352 if not header_name and not output_file: 2353 output_file = self.get_output() 2354 2355 if self.get_header(): 2356 2357 # Get header object 2358 header_obj = self.get_header() 2359 2360 # Create database 2361 db_for_header = Database(database=self.get_input()) 2362 2363 # Get real columns in the file 2364 db_header_columns = db_for_header.get_columns() 2365 2366 with tempfile.TemporaryDirectory() as tmpdir: 2367 2368 # Write header file 2369 header_file_tmp = os.path.join(tmpdir, "header") 2370 f = open(header_file_tmp, "w") 2371 vcf.Writer(f, header_obj) 2372 f.close() 2373 2374 # Replace #CHROM line with rel columns 2375 header_list = db_for_header.read_header_file( 2376 header_file=header_file_tmp 2377 ) 2378 header_list[-1] = "\t".join(db_header_columns) 2379 2380 # Remove CHROM line 2381 if remove_chrom_line: 2382 header_list.pop() 2383 2384 # Clean header 2385 if clean_header: 2386 header_list_clean = [] 2387 for head in header_list: 2388 # Clean head for malformed header 2389 head_clean = head 2390 head_clean = re.subn( 2391 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2392 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2393 head_clean, 2394 2, 2395 )[0] 2396 # Write header 2397 header_list_clean.append(head_clean) 2398 header_list = header_list_clean 2399 2400 tmp_header_name = output_file + output_file_ext 2401 2402 f = open(tmp_header_name, "w") 2403 for line in header_list: 2404 f.write(line) 2405 f.close() 2406 2407 return tmp_header_name 2408 2409 def export_variant_vcf( 2410 self, 2411 vcf_file, 2412 remove_info: bool = False, 2413 add_samples: bool = True, 2414 list_samples: list = [], 2415 where_clause: str = "", 2416 index: bool = False, 2417 threads: int | None = None, 2418 ) -> bool | None: 2419 """ 2420 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2421 remove INFO field, add samples, and control compression and indexing. 
2422 2423 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2424 written to. It is the output file that will contain the filtered VCF data based on the specified 2425 parameters 2426 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2427 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2428 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2429 in, defaults to False 2430 :type remove_info: bool (optional) 2431 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2432 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2433 If set to False, the samples will be removed. The default value is True, defaults to True 2434 :type add_samples: bool (optional) 2435 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2436 in the output VCF file. By default, all samples will be included. If you provide a list of 2437 samples, only those samples will be included in the output file 2438 :type list_samples: list 2439 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2440 determines whether or not to create an index for the output VCF file. If `index` is set to 2441 `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False 2442 :type index: bool (optional) 2443 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2444 number of threads to use for exporting the VCF file. It determines how many parallel threads 2445 will be used during the export process. More threads can potentially speed up the export process 2446 by utilizing multiple cores of the processor. 
If 2447 :type threads: int | None 2448 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2449 method with various parameters including the output file, query, threads, sort flag, and index 2450 flag. The `export_output` method is responsible for exporting the VCF data based on the 2451 specified parameters and configurations provided in the `export_variant_vcf` function. 2452 """ 2453 2454 # Config 2455 config = self.get_config() 2456 2457 # Extract VCF 2458 log.debug("Export VCF...") 2459 2460 # Table variants 2461 table_variants = self.get_table_variants() 2462 2463 # Threads 2464 if not threads: 2465 threads = self.get_threads() 2466 2467 # Info fields 2468 if remove_info: 2469 if not isinstance(remove_info, str): 2470 remove_info = "." 2471 info_field = f"""'{remove_info}' as INFO""" 2472 else: 2473 info_field = "INFO" 2474 2475 # Samples fields 2476 if add_samples: 2477 if not list_samples: 2478 list_samples = self.get_header_sample_list() 2479 if list_samples: 2480 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2481 else: 2482 samples_fields = "" 2483 log.debug(f"samples_fields: {samples_fields}") 2484 else: 2485 samples_fields = "" 2486 2487 # Where clause 2488 if where_clause is None: 2489 where_clause = "" 2490 2491 # Variants 2492 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2493 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2494 log.debug(f"sql_query_select={sql_query_select}") 2495 2496 return self.export_output( 2497 output_file=vcf_file, 2498 output_header=None, 2499 export_header=True, 2500 query=sql_query_select, 2501 parquet_partitions=None, 2502 chunk_size=config.get("chunk_size", None), 2503 threads=threads, 2504 sort=True, 2505 index=index, 2506 order_by=None, 2507 ) 2508 2509 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2510 """ 2511 It takes a list of commands and runs 
them in parallel using the number of threads specified

        :param commands: A list of commands to run
        :param threads: The number of threads to use, defaults to 1 (optional)
        """

        run_parallel_commands(commands, threads)

    def get_threads(self, default: int = 1) -> int:
        """
        Return the number of threads to use for the current job.

        Resolution order: `threads` in param, then in config, then `default`.
        A non-positive configured value means "use all available cores"
        (`os.cpu_count()`).

        :param default: fallback thread count when nothing is configured,
            defaults to 1
        :type default: int (optional)
        :return: the number of threads to use for the current job.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Input threads (param takes precedence over config)
        input_thread = param.get("threads", config.get("threads", None))

        # Check threads: falsy -> default; <= 0 -> all cores; else explicit count
        if not input_thread:
            threads = default
        elif int(input_thread) <= 0:
            threads = os.cpu_count()
        else:
            threads = int(input_thread)
        return threads

    def get_memory(self, default: str = None) -> str:
        """
        Retrieve the memory setting from parameters or configuration, with a
        default fallback.

        Resolution order: `memory` in param, then in config, then `default`.

        :param default: value returned when `memory` is configured nowhere
        :type default: str
        :return: the memory setting as a string. If `input_memory` is provided
        in the parameters, it will return that value.
Otherwise, it will
        return the default value provided as an argument to the function.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Input memory (param takes precedence over config)
        input_memory = param.get("memory", config.get("memory", None))

        # Fall back to the provided default when not configured
        if input_memory:
            memory = input_memory
        else:
            memory = default

        return memory

    def update_from_vcf(self, vcf_file: str) -> None:
        """
        Dispatch the INFO-merge update to the backend matching the connexion
        format: duckdb or sqlite. Any other format is silently ignored.

        :param vcf_file: the path to the VCF file
        """

        connexion_format = self.get_connexion_format()

        if connexion_format in ["duckdb"]:
            self.update_from_vcf_duckdb(vcf_file)
        elif connexion_format in ["sqlite"]:
            self.update_from_vcf_sqlite(vcf_file)

    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the variants table (DuckDB).

        The VCF is loaded into a pandas DataFrame (`vcf_df`), which DuckDB can
        query directly by name in the correlated subquery below. Variants are
        matched on #CHROM/POS/REF/ALT; non-empty INFO values are concatenated
        with ';' between existing and new annotations.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF into a DataFrame, skipping the meta-header lines
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # '' and '.' both denote an empty INFO field throughout this query
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                            SELECT
                                concat(
                                    CASE
                                        WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                        THEN ';'
                                        ELSE ''
                                    END
                                    ,
                                    CASE
                                        WHEN table_parquet.INFO NOT IN ('','.')
                                        THEN table_parquet.INFO
                                        ELSE ''
                                    END
                                )
                            FROM vcf_df as table_parquet
                            WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                AND table_parquet.\"POS\" = table_variants.\"POS\"
                                AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                AND table_parquet.\"REF\" = table_variants.\"REF\"
                                AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)

    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the variants table (SQLite).

        Loads the VCF into a temporary table, updates `variants.INFO` by
        concatenating matching non-empty INFO values (matched on
        #CHROM/POS/REF/ALT, joined with ';'), then drops the temporary table.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table with the same schema as 'variants' (WHERE 0 = no rows)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF into the temporary table (header lines start with '#')
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: SQLite has no CONCAT function — use the || operator
        sql_query_update = f"""
        UPDATE variants as table_variants
        SET INFO = CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END ||
                    (
                    SELECT
                        CASE
                            WHEN table_variants.INFO NOT IN ('','.')
                                AND table_vcf.INFO NOT IN ('','.')
                            THEN ';'
                            ELSE ''
                        END ||
                        CASE
                            WHEN table_vcf.INFO NOT IN ('','.')
                            THEN table_vcf.INFO
                            ELSE ''
                        END
                    FROM {table_vcf} as table_vcf
                    WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                        AND table_vcf.\"POS\" = table_variants.\"POS\"
                        AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                        AND table_vcf.\"REF\" = table_variants.\"REF\"
                    )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)

    def drop_variants_table(self) -> None:
        """
        Drop the variants table if it exists.
        """

        table_variants = self.get_table_variants()
        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
        self.conn.execute(sql_table_variants)

    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a `variant_id` column to the variants table and populate it with a
        hash of assembly, #CHROM, POS, REF and ALT.

        :param variant_id_column: name of the column to create in the variants
            table, defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: if True, (re)populate the column even if it already exists
        :type force: bool
        :return: the name of the column that contains the variant_id
        """

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into a table column (dropped again below)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # variant_id column (normalize falsy input to the default name)
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): the last hash() argument is the single-quoted string
            # literal '"{prefix}SVTYPE"' — a constant string (the column NAME),
            # not the exploded SVTYPE column's value. Confirm this is intended;
            # as written SVTYPE does not differentiate the hash.
            self.conn.execute(
                f"""
                UPDATE {table_variants}
                SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column

    def get_variant_id_column(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Return the variant_id column name, creating/populating the column first
        via `set_variant_id`.

        :param variant_id_column: name of the variant_id column, defaults to
            variant_id
        :type variant_id_column: str (optional)
        :param force: forwarded to `set_variant_id`: if True, repopulate the
            column even when it already exists
        :type force: bool
        :return: The variant_id column name.
        """

        return self.set_variant_id(variant_id_column=variant_id_column, force=force)

    ###
    # Annotation
    ###

    # NOTE(review): both defaults below are mutable lists — safe only while never
    # mutated in place.
    def scan_databases(
        self,
        database_formats: list = ["parquet"],
        database_releases: list = ["current"],
    ) -> dict:
        """
        Scan for available annotation databases matching the given formats and
        releases.

        :param database_formats: formats of the databases to scan for
            (e.g. "parquet")
        :type database_formats: list ["parquet"]
        :param database_releases: the
        releases of the databases to be scanned.
In the provided function, the default value for
        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
        databases that are in the "current" release
        :type database_releases: list
        :return: The function `scan_databases` returns a dictionary containing information about
        databases that match the specified formats and releases.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (param over config, then the project default)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # Scan for available databases
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
        )
        databases_infos_dict = databases_infos(
            database_folder_releases=database_releases,
            database_formats=database_formats,
            assembly=assembly,
            config=config,
        )
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
        )

        return databases_infos_dict

    def annotation(self) -> None:
        """
        It annotates the VCF file with the annotations specified in the config file.
2852 """ 2853 2854 # Config 2855 config = self.get_config() 2856 2857 # Param 2858 param = self.get_param() 2859 2860 # Param - Assembly 2861 assembly = param.get("assembly", config.get("assembly", None)) 2862 if not assembly: 2863 assembly = DEFAULT_ASSEMBLY 2864 log.warning(f"Default assembly '{assembly}'") 2865 2866 # annotations databases folders 2867 annotations_databases = set( 2868 config.get("folders", {}) 2869 .get("databases", {}) 2870 .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER]) 2871 + config.get("folders", {}) 2872 .get("databases", {}) 2873 .get("parquet", ["~/howard/databases/parquet/current"]) 2874 + config.get("folders", {}) 2875 .get("databases", {}) 2876 .get("bcftools", ["~/howard/databases/bcftools/current"]) 2877 ) 2878 2879 # Get param annotations 2880 if param.get("annotations", None) and isinstance( 2881 param.get("annotations", None), str 2882 ): 2883 log.debug(param.get("annotations", None)) 2884 param_annotation_list = param.get("annotations").split(",") 2885 else: 2886 param_annotation_list = [] 2887 2888 # Each tools param 2889 if param.get("annotation_parquet", None) != None: 2890 log.debug( 2891 f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}""" 2892 ) 2893 if isinstance(param.get("annotation_parquet", None), list): 2894 param_annotation_list.append(",".join(param.get("annotation_parquet"))) 2895 else: 2896 param_annotation_list.append(param.get("annotation_parquet")) 2897 if param.get("annotation_snpsift", None) != None: 2898 if isinstance(param.get("annotation_snpsift", None), list): 2899 param_annotation_list.append( 2900 "snpsift:" 2901 + "+".join(param.get("annotation_snpsift")).replace(",", "+") 2902 ) 2903 else: 2904 param_annotation_list.append( 2905 "snpsift:" + param.get("annotation_snpsift").replace(",", "+") 2906 ) 2907 if param.get("annotation_snpeff", None) != None: 2908 param_annotation_list.append("snpeff:" + param.get("annotation_snpeff")) 2909 if param.get("annotation_bcftools", 
None) != None: 2910 if isinstance(param.get("annotation_bcftools", None), list): 2911 param_annotation_list.append( 2912 "bcftools:" 2913 + "+".join(param.get("annotation_bcftools")).replace(",", "+") 2914 ) 2915 else: 2916 param_annotation_list.append( 2917 "bcftools:" + param.get("annotation_bcftools").replace(",", "+") 2918 ) 2919 if param.get("annotation_annovar", None) != None: 2920 param_annotation_list.append("annovar:" + param.get("annotation_annovar")) 2921 if param.get("annotation_exomiser", None) != None: 2922 param_annotation_list.append("exomiser:" + param.get("annotation_exomiser")) 2923 if param.get("annotation_splice", None) != None: 2924 param_annotation_list.append("splice:" + param.get("annotation_splice")) 2925 2926 # Merge param annotations list 2927 param["annotations"] = ",".join(param_annotation_list) 2928 2929 # debug 2930 log.debug(f"param_annotations={param['annotations']}") 2931 2932 if param.get("annotations"): 2933 2934 # Log 2935 # log.info("Annotations - Check annotation parameters") 2936 2937 if not "annotation" in param: 2938 param["annotation"] = {} 2939 2940 # List of annotations parameters 2941 annotations_list_input = {} 2942 if isinstance(param.get("annotations", None), str): 2943 annotation_file_list = [ 2944 value for value in param.get("annotations", "").split(",") 2945 ] 2946 for annotation_file in annotation_file_list: 2947 annotations_list_input[annotation_file] = {"INFO": None} 2948 else: 2949 annotations_list_input = param.get("annotations", {}) 2950 2951 log.info(f"Quick Annotations:") 2952 for annotation_key in list(annotations_list_input.keys()): 2953 log.info(f" {annotation_key}") 2954 2955 # List of annotations and associated fields 2956 annotations_list = {} 2957 2958 for annotation_file in annotations_list_input: 2959 2960 # Explode annotations if ALL 2961 if ( 2962 annotation_file.upper() == "ALL" 2963 or annotation_file.upper().startswith("ALL:") 2964 ): 2965 2966 # check ALL parameters (formats, releases) 
2967 annotation_file_split = annotation_file.split(":") 2968 database_formats = "parquet" 2969 database_releases = "current" 2970 for annotation_file_option in annotation_file_split[1:]: 2971 database_all_options_split = annotation_file_option.split("=") 2972 if database_all_options_split[0] == "format": 2973 database_formats = database_all_options_split[1].split("+") 2974 if database_all_options_split[0] == "release": 2975 database_releases = database_all_options_split[1].split("+") 2976 2977 # Scan for availabled databases 2978 databases_infos_dict = self.scan_databases( 2979 database_formats=database_formats, 2980 database_releases=database_releases, 2981 ) 2982 2983 # Add found databases in annotation parameters 2984 for database_infos in databases_infos_dict.keys(): 2985 annotations_list[database_infos] = {"INFO": None} 2986 2987 else: 2988 annotations_list[annotation_file] = annotations_list_input[ 2989 annotation_file 2990 ] 2991 2992 # Check each databases 2993 if len(annotations_list): 2994 2995 log.info( 2996 f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..." 
2997 ) 2998 2999 for annotation_file in annotations_list: 3000 3001 # Init 3002 annotations = annotations_list.get(annotation_file, None) 3003 3004 # Annotation snpEff 3005 if annotation_file.startswith("snpeff"): 3006 3007 log.debug(f"Quick Annotation snpEff") 3008 3009 if "snpeff" not in param["annotation"]: 3010 param["annotation"]["snpeff"] = {} 3011 3012 if "options" not in param["annotation"]["snpeff"]: 3013 param["annotation"]["snpeff"]["options"] = "" 3014 3015 # snpEff options in annotations 3016 param["annotation"]["snpeff"]["options"] = "".join( 3017 annotation_file.split(":")[1:] 3018 ) 3019 3020 # Annotation Annovar 3021 elif annotation_file.startswith("annovar"): 3022 3023 log.debug(f"Quick Annotation Annovar") 3024 3025 if "annovar" not in param["annotation"]: 3026 param["annotation"]["annovar"] = {} 3027 3028 if "annotations" not in param["annotation"]["annovar"]: 3029 param["annotation"]["annovar"]["annotations"] = {} 3030 3031 # Options 3032 annotation_file_split = annotation_file.split(":") 3033 for annotation_file_annotation in annotation_file_split[1:]: 3034 if annotation_file_annotation: 3035 param["annotation"]["annovar"]["annotations"][ 3036 annotation_file_annotation 3037 ] = annotations 3038 3039 # Annotation Exomiser 3040 elif annotation_file.startswith("exomiser"): 3041 3042 log.debug(f"Quick Annotation Exomiser") 3043 3044 param["annotation"]["exomiser"] = params_string_to_dict( 3045 annotation_file 3046 ) 3047 3048 # Annotation Splice 3049 elif annotation_file.startswith("splice"): 3050 3051 log.debug(f"Quick Annotation Splice") 3052 3053 param["annotation"]["splice"] = params_string_to_dict( 3054 annotation_file 3055 ) 3056 3057 # Annotation Parquet or BCFTOOLS 3058 else: 3059 3060 # Tools detection 3061 if annotation_file.startswith("bcftools:"): 3062 annotation_tool_initial = "bcftools" 3063 annotation_file = ":".join(annotation_file.split(":")[1:]) 3064 elif annotation_file.startswith("snpsift:"): 3065 annotation_tool_initial = 
"snpsift" 3066 annotation_file = ":".join(annotation_file.split(":")[1:]) 3067 else: 3068 annotation_tool_initial = None 3069 3070 # list of files 3071 annotation_file_list = annotation_file.replace("+", ":").split( 3072 ":" 3073 ) 3074 3075 for annotation_file in annotation_file_list: 3076 3077 if annotation_file: 3078 3079 # Annotation tool initial 3080 annotation_tool = annotation_tool_initial 3081 3082 # Find file 3083 annotation_file_found = None 3084 3085 # Expand user 3086 annotation_file = full_path(annotation_file) 3087 3088 if os.path.exists(annotation_file): 3089 annotation_file_found = annotation_file 3090 3091 else: 3092 # Find within assembly folders 3093 for annotations_database in annotations_databases: 3094 found_files = find_all( 3095 annotation_file, 3096 os.path.join( 3097 annotations_database, assembly 3098 ), 3099 ) 3100 if len(found_files) > 0: 3101 annotation_file_found = found_files[0] 3102 break 3103 if not annotation_file_found and not assembly: 3104 # Find within folders 3105 for ( 3106 annotations_database 3107 ) in annotations_databases: 3108 found_files = find_all( 3109 annotation_file, annotations_database 3110 ) 3111 if len(found_files) > 0: 3112 annotation_file_found = found_files[0] 3113 break 3114 log.debug( 3115 f"for {annotation_file} annotation_file_found={annotation_file_found}" 3116 ) 3117 3118 # Full path 3119 annotation_file_found = full_path(annotation_file_found) 3120 3121 if annotation_file_found: 3122 3123 database = Database(database=annotation_file_found) 3124 quick_annotation_format = database.get_format() 3125 quick_annotation_is_compressed = ( 3126 database.is_compressed() 3127 ) 3128 quick_annotation_is_indexed = os.path.exists( 3129 f"{annotation_file_found}.tbi" 3130 ) 3131 bcftools_preference = False 3132 3133 # Check Annotation Tool 3134 if not annotation_tool: 3135 if ( 3136 bcftools_preference 3137 and quick_annotation_format 3138 in ["vcf", "bed"] 3139 and quick_annotation_is_compressed 3140 and 
quick_annotation_is_indexed 3141 ): 3142 annotation_tool = "bcftools" 3143 elif quick_annotation_format in [ 3144 "vcf", 3145 "bed", 3146 "tsv", 3147 "tsv", 3148 "csv", 3149 "json", 3150 "tbl", 3151 "parquet", 3152 "duckdb", 3153 ]: 3154 annotation_tool = "parquet" 3155 else: 3156 log.error( 3157 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3158 ) 3159 raise ValueError( 3160 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3161 ) 3162 3163 log.debug( 3164 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 3165 ) 3166 3167 # Annotation Tool dispatch 3168 if annotation_tool: 3169 if annotation_tool not in param["annotation"]: 3170 param["annotation"][annotation_tool] = {} 3171 if ( 3172 "annotations" 3173 not in param["annotation"][annotation_tool] 3174 ): 3175 param["annotation"][annotation_tool][ 3176 "annotations" 3177 ] = {} 3178 param["annotation"][annotation_tool][ 3179 "annotations" 3180 ][annotation_file_found] = annotations 3181 3182 else: 3183 log.error( 3184 f"Quick Annotation File {annotation_file} does NOT exist" 3185 ) 3186 3187 self.set_param(param) 3188 3189 if param.get("annotation", None): 3190 log.info("Annotations") 3191 if param.get("annotation", {}).get("parquet", None): 3192 log.info("Annotations 'parquet'...") 3193 self.annotation_parquet() 3194 if param.get("annotation", {}).get("bcftools", None): 3195 log.info("Annotations 'bcftools'...") 3196 self.annotation_bcftools() 3197 if param.get("annotation", {}).get("snpsift", None): 3198 log.info("Annotations 'snpsift'...") 3199 self.annotation_snpsift() 3200 if param.get("annotation", {}).get("annovar", None): 3201 log.info("Annotations 'annovar'...") 3202 self.annotation_annovar() 3203 if param.get("annotation", {}).get("snpeff", None): 3204 log.info("Annotations 'snpeff'...") 3205 self.annotation_snpeff() 3206 if param.get("annotation", {}).get("exomiser", 
None) is not None: 3207 log.info("Annotations 'exomiser'...") 3208 self.annotation_exomiser() 3209 if param.get("annotation", {}).get("splice", None) is not None: 3210 log.info("Annotations 'splice' ...") 3211 self.annotation_splice() 3212 3213 # Explode INFOS fields into table fields 3214 if self.get_explode_infos(): 3215 self.explode_infos( 3216 prefix=self.get_explode_infos_prefix(), 3217 fields=self.get_explode_infos_fields(), 3218 force=True, 3219 ) 3220 3221 def annotation_snpsift(self, threads: int = None) -> None: 3222 """ 3223 This function annotate with bcftools 3224 3225 :param threads: Number of threads to use 3226 :return: the value of the variable "return_value". 3227 """ 3228 3229 # DEBUG 3230 log.debug("Start annotation with bcftools databases") 3231 3232 # Threads 3233 if not threads: 3234 threads = self.get_threads() 3235 log.debug("Threads: " + str(threads)) 3236 3237 # Config 3238 config = self.get_config() 3239 log.debug("Config: " + str(config)) 3240 3241 # Config - snpSift 3242 snpsift_bin_command = get_bin_command( 3243 bin="SnpSift.jar", 3244 tool="snpsift", 3245 bin_type="jar", 3246 config=config, 3247 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3248 ) 3249 if not snpsift_bin_command: 3250 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3251 log.error(msg_err) 3252 raise ValueError(msg_err) 3253 3254 # Config - bcftools 3255 bcftools_bin_command = get_bin_command( 3256 bin="bcftools", 3257 tool="bcftools", 3258 bin_type="bin", 3259 config=config, 3260 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3261 ) 3262 if not bcftools_bin_command: 3263 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3264 log.error(msg_err) 3265 raise ValueError(msg_err) 3266 3267 # Config - BCFTools databases folders 3268 databases_folders = set( 3269 self.get_config() 3270 .get("folders", {}) 3271 .get("databases", {}) 3272 .get("annotations", ["."]) 3273 + self.get_config() 3274 .get("folders", {}) 3275 
.get("databases", {}) 3276 .get("bcftools", ["."]) 3277 ) 3278 log.debug("Databases annotations: " + str(databases_folders)) 3279 3280 # Param 3281 annotations = ( 3282 self.get_param() 3283 .get("annotation", {}) 3284 .get("snpsift", {}) 3285 .get("annotations", None) 3286 ) 3287 log.debug("Annotations: " + str(annotations)) 3288 3289 # Assembly 3290 assembly = self.get_param().get( 3291 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3292 ) 3293 3294 # Data 3295 table_variants = self.get_table_variants() 3296 3297 # Check if not empty 3298 log.debug("Check if not empty") 3299 sql_query_chromosomes = ( 3300 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3301 ) 3302 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3303 if not sql_query_chromosomes_df["count"][0]: 3304 log.info(f"VCF empty") 3305 return 3306 3307 # VCF header 3308 vcf_reader = self.get_header() 3309 log.debug("Initial header: " + str(vcf_reader.infos)) 3310 3311 # Existing annotations 3312 for vcf_annotation in self.get_header().infos: 3313 3314 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3315 log.debug( 3316 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3317 ) 3318 3319 if annotations: 3320 3321 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3322 3323 # Export VCF file 3324 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3325 3326 # Init 3327 commands = {} 3328 3329 for annotation in annotations: 3330 annotation_fields = annotations[annotation] 3331 3332 # Annotation Name 3333 annotation_name = os.path.basename(annotation) 3334 3335 if not annotation_fields: 3336 annotation_fields = {"INFO": None} 3337 3338 log.debug(f"Annotation '{annotation_name}'") 3339 log.debug( 3340 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3341 ) 3342 3343 # Create Database 3344 database = Database( 3345 database=annotation, 3346 databases_folders=databases_folders, 3347 
assembly=assembly, 3348 ) 3349 3350 # Find files 3351 db_file = database.get_database() 3352 db_file = full_path(db_file) 3353 db_hdr_file = database.get_header_file() 3354 db_hdr_file = full_path(db_hdr_file) 3355 db_file_type = database.get_format() 3356 db_tbi_file = f"{db_file}.tbi" 3357 db_file_compressed = database.is_compressed() 3358 3359 # Check if compressed 3360 if not db_file_compressed: 3361 log.error( 3362 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3363 ) 3364 raise ValueError( 3365 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3366 ) 3367 3368 # Check if indexed 3369 if not os.path.exists(db_tbi_file): 3370 log.error( 3371 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3372 ) 3373 raise ValueError( 3374 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3375 ) 3376 3377 # Check index - try to create if not exists 3378 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3379 log.error("Annotation failed: database not valid") 3380 log.error(f"Annotation annotation file: {db_file}") 3381 log.error(f"Annotation annotation header: {db_hdr_file}") 3382 log.error(f"Annotation annotation index: {db_tbi_file}") 3383 raise ValueError( 3384 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3385 ) 3386 else: 3387 3388 log.debug( 3389 f"Annotation '{annotation}' - file: " 3390 + str(db_file) 3391 + " and " 3392 + str(db_hdr_file) 3393 ) 3394 3395 # Load header as VCF object 3396 db_hdr_vcf = Variants(input=db_hdr_file) 3397 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3398 log.debug( 3399 "Annotation database header: " 3400 + str(db_hdr_vcf_header_infos) 3401 ) 3402 3403 # For all fields in database 3404 annotation_fields_full = False 3405 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3406 annotation_fields = { 3407 key: key for key in 
db_hdr_vcf_header_infos 3408 } 3409 log.debug( 3410 "Annotation database header - All annotations added: " 3411 + str(annotation_fields) 3412 ) 3413 annotation_fields_full = True 3414 3415 # # Create file for field rename 3416 # log.debug("Create file for field rename") 3417 # tmp_rename = NamedTemporaryFile( 3418 # prefix=self.get_prefix(), 3419 # dir=self.get_tmp_dir(), 3420 # suffix=".rename", 3421 # delete=False, 3422 # ) 3423 # tmp_rename_name = tmp_rename.name 3424 # tmp_files.append(tmp_rename_name) 3425 3426 # Number of fields 3427 nb_annotation_field = 0 3428 annotation_list = [] 3429 annotation_infos_rename_list = [] 3430 3431 for annotation_field in annotation_fields: 3432 3433 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3434 annotation_fields_new_name = annotation_fields.get( 3435 annotation_field, annotation_field 3436 ) 3437 if not annotation_fields_new_name: 3438 annotation_fields_new_name = annotation_field 3439 3440 # Check if field is in DB and if field is not elready in input data 3441 if ( 3442 annotation_field in db_hdr_vcf.get_header().infos 3443 and annotation_fields_new_name 3444 not in self.get_header().infos 3445 ): 3446 3447 log.info( 3448 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3449 ) 3450 3451 # BCFTools annotate param to rename fields 3452 if annotation_field != annotation_fields_new_name: 3453 annotation_infos_rename_list.append( 3454 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3455 ) 3456 3457 # Add INFO field to header 3458 db_hdr_vcf_header_infos_number = ( 3459 db_hdr_vcf_header_infos[annotation_field].num or "." 
3460 ) 3461 db_hdr_vcf_header_infos_type = ( 3462 db_hdr_vcf_header_infos[annotation_field].type 3463 or "String" 3464 ) 3465 db_hdr_vcf_header_infos_description = ( 3466 db_hdr_vcf_header_infos[annotation_field].desc 3467 or f"{annotation_field} description" 3468 ) 3469 db_hdr_vcf_header_infos_source = ( 3470 db_hdr_vcf_header_infos[annotation_field].source 3471 or "unknown" 3472 ) 3473 db_hdr_vcf_header_infos_version = ( 3474 db_hdr_vcf_header_infos[annotation_field].version 3475 or "unknown" 3476 ) 3477 3478 vcf_reader.infos[annotation_fields_new_name] = ( 3479 vcf.parser._Info( 3480 annotation_fields_new_name, 3481 db_hdr_vcf_header_infos_number, 3482 db_hdr_vcf_header_infos_type, 3483 db_hdr_vcf_header_infos_description, 3484 db_hdr_vcf_header_infos_source, 3485 db_hdr_vcf_header_infos_version, 3486 self.code_type_map[ 3487 db_hdr_vcf_header_infos_type 3488 ], 3489 ) 3490 ) 3491 3492 annotation_list.append(annotation_field) 3493 3494 nb_annotation_field += 1 3495 3496 else: 3497 3498 if ( 3499 annotation_field 3500 not in db_hdr_vcf.get_header().infos 3501 ): 3502 log.warning( 3503 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3504 ) 3505 if ( 3506 annotation_fields_new_name 3507 in self.get_header().infos 3508 ): 3509 log.warning( 3510 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3511 ) 3512 3513 log.info( 3514 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3515 ) 3516 3517 annotation_infos = ",".join(annotation_list) 3518 3519 if annotation_infos != "": 3520 3521 # Annotated VCF (and error file) 3522 tmp_annotation_vcf_name = os.path.join( 3523 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3524 ) 3525 tmp_annotation_vcf_name_err = ( 3526 tmp_annotation_vcf_name + ".err" 3527 ) 3528 3529 # Add fields to annotate 3530 if not annotation_fields_full: 3531 annotation_infos_option = f"-info {annotation_infos}" 3532 else: 
                                annotation_infos_option = ""

                            # Info fields rename
                            if annotation_infos_rename_list:
                                annotation_infos_rename = " -c " + ",".join(
                                    annotation_infos_rename_list
                                )
                            else:
                                annotation_infos_rename = ""

                            # Annotate command
                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands[command_annotate] = tmp_annotation_vcf_name

            if commands:

                # Export VCF file
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )
                shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")

                # Num command
                nb_command = 0

                # Annotate
                for command_annotate in commands:
                    nb_command += 1
                    log.info(
                        f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
                    )
                    log.debug(f"command_annotate={command_annotate}")
                    run_parallel_commands([command_annotate], threads)

                    # Debug
                    shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")

                    # Update variants
                    log.info(
                        f"Annotation - Updating [{nb_command}/{len(commands)}]..."
                    )
                    self.update_from_vcf(commands[command_annotate])

    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate the variants table with bcftools, using the VCF/BED annotation
        databases declared in param "annotation" -> "bcftools" -> "annotations".

        For each database, the requested INFO fields (or every field when "ALL"
        or "INFO" is given) are added to the in-memory VCF header; then one
        `bcftools annotate` command is built per chromosome, restricted with a
        regions BED of merged windows around the variants. The commands are run
        in parallel, the per-chromosome outputs are combined with
        `bcftools merge`, and the merged VCF is loaded back into the variants
        table via `update_from_vcf`.

        :param threads: Number of threads to use; defaults to `self.get_threads()`
        :raises ValueError: if no bcftools binary is available, if a database is
            not compressed, not indexed or not valid, or if any bcftools command
            wrote "[E::" lines to its stderr capture file
        :return: None. Returns early (without raising) if the variants table is
            empty or if no annotations are configured.
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Keep temporary files/folders around when running at debug verbosity,
        # so failed commands can be inspected afterwards.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders
        # Union of the generic "annotations" folders and the bcftools-specific
        # ones (both default to the current directory).
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - mapping of database -> fields to annotate with
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param takes precedence over config, then default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty - nothing to annotate in an empty table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF
        # NOTE: only the temp file *name* is created here; the actual export
        # happens later (export_variant_vcf), once commands are known.
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header (mutated below when new INFO fields are added)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug trace only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            tmp_ann_vcf_list = []
            commands = []
            tmp_files = []
            err_files = []

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # No explicit fields means "all INFO fields" (expanded below)
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database object to locate the annotation files
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files (data, header, format, tabix index)
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed (bcftools -a requires bgzip-compressed input)
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed (.tbi required for --regions-file access)
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check database and header files exist (no auto-creation is
                # attempted here; missing files are a hard error)
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load header as VCF object (header file parsed through Variants)
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # "ALL"/"INFO" sentinel: expand to every field of the database
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # Optional rename: dict value is the new field name
                        # (falls back to the original name when empty/None)
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Annotate only if the field exists in the database
                        # header AND is not already present in the input header
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header, substituting defaults
                            # for any metadata missing from the database header
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # bcftools rename syntax "NEW:=INFO/OLD" when the
                            # field is renamed, plain field name otherwise
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools: keep only "##" meta lines
                        # (drop "#CHROM" and any variant lines from the header file)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command (zcat for gzipped header files, cat otherwise)
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chromosomes present in the variants table
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED databases carry their coordinates in the first
                        # three columns, so prepend them to the -c column list
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        # One bcftools command per chromosome
                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detect regions: a window around each variant
                            # (clamped at 0), then merged, to restrict bcftools
                            # to the relevant loci only
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT \"#CHROM\",
                                CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files (per-chromosome annotated VCF + stderr capture)
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command: exact REF/ALT matching (--pair-logic exact),
                            # bgzipped output at compression level 1 (-Oz1),
                            # then tabix-index the result; stderr appended to the
                            # .err file for later inspection
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export VCF file (the actual input all commands read from)
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # Split the thread budget across the parallel commands;
                # round() may yield 0, corrected to a minimum of 1 below
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands (string rewrite of
                # the already-built command lines)
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge all per-chromosome annotated VCFs back together
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp file remove command, chained after the merge
                    # NOTE(review): runs regardless of delete_tmp — debug runs
                    # still lose these intermediate files; confirm intended
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Error messages: scan every .err capture file; "[W::" lines
                    # are warnings, "[E::" lines are fatal errors
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # log info (deduplicated errors + warnings)
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f" {message}")
                    # debug info (all messages, deduplicated)
                    for message in list(set(error_message_command_all)):
                        log.debug(f" {message}")
                    # failed: any "[E::" line aborts the annotation
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Update variants table from the merged annotated VCF
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)

    def annotation_exomiser(self, threads: int = None) -> None:
        """
        This function annotate with Exomiser

        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
        - "analysis" (dict/file):
            Full analysis dictionnary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
            Default : None
        - "preset" (string):
            Analysis preset (available in config folder).
4074 Used if no full "analysis" is provided. 4075 Default: "exome" 4076 - "phenopacket" (dict/file): 4077 Samples and phenotipic features parameters (see Exomiser docs). 4078 Either a dict, or a file in JSON or YAML format. 4079 Default: None 4080 - "subject" (dict): 4081 Sample parameters (see Exomiser docs). 4082 Example: 4083 "subject": 4084 { 4085 "id": "ISDBM322017", 4086 "sex": "FEMALE" 4087 } 4088 Default: None 4089 - "sample" (string): 4090 Sample name to construct "subject" section: 4091 "subject": 4092 { 4093 "id": "<sample>", 4094 "sex": "UNKNOWN_SEX" 4095 } 4096 Default: None 4097 - "phenotypicFeatures" (dict) 4098 Phenotypic features to construct "subject" section. 4099 Example: 4100 "phenotypicFeatures": 4101 [ 4102 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 4103 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 4104 ] 4105 - "hpo" (list) 4106 List of HPO ids as phenotypic features. 4107 Example: 4108 "hpo": ['0001156', '0001363', '0011304', '0010055'] 4109 Default: [] 4110 - "outputOptions" (dict): 4111 Output options (see Exomiser docs). 4112 Default: 4113 "output_options" = 4114 { 4115 "outputContributingVariantsOnly": False, 4116 "numGenes": 0, 4117 "outputFormats": ["TSV_VARIANT", "VCF"] 4118 } 4119 - "transcript_source" (string): 4120 Transcript source (either "refseq", "ucsc", "ensembl") 4121 Default: "refseq" 4122 - "exomiser_to_info" (boolean): 4123 Add exomiser TSV file columns as INFO fields in VCF. 4124 Default: False 4125 - "release" (string): 4126 Exomise database release. 4127 If not exists, database release will be downloaded (take a while). 4128 Default: None (provided by application.properties configuration file) 4129 - "exomiser_application_properties" (file): 4130 Exomiser configuration file (see Exomiser docs). 4131 Useful to automatically download databases (especially for specific genome databases). 
4132 4133 Notes: 4134 - If no sample in parameters, first sample in VCF will be chosen 4135 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 4136 4137 :param threads: The number of threads to use 4138 :return: None. 4139 """ 4140 4141 # DEBUG 4142 log.debug("Start annotation with Exomiser databases") 4143 4144 # Threads 4145 if not threads: 4146 threads = self.get_threads() 4147 log.debug("Threads: " + str(threads)) 4148 4149 # Config 4150 config = self.get_config() 4151 log.debug("Config: " + str(config)) 4152 4153 # Config - Folders - Databases 4154 databases_folders = ( 4155 config.get("folders", {}) 4156 .get("databases", {}) 4157 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 4158 ) 4159 databases_folders = full_path(databases_folders) 4160 if not os.path.exists(databases_folders): 4161 log.error(f"Databases annotations: {databases_folders} NOT found") 4162 log.debug("Databases annotations: " + str(databases_folders)) 4163 4164 # Config - Exomiser 4165 exomiser_bin_command = get_bin_command( 4166 bin="exomiser-cli*.jar", 4167 tool="exomiser", 4168 bin_type="jar", 4169 config=config, 4170 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 4171 ) 4172 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 4173 if not exomiser_bin_command: 4174 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 4175 log.error(msg_err) 4176 raise ValueError(msg_err) 4177 4178 # Param 4179 param = self.get_param() 4180 log.debug("Param: " + str(param)) 4181 4182 # Param - Exomiser 4183 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 4184 log.debug(f"Param Exomiser: {param_exomiser}") 4185 4186 # Param - Assembly 4187 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4188 log.debug("Assembly: " + str(assembly)) 4189 4190 # Data 4191 table_variants = self.get_table_variants() 4192 4193 # Check if not empty 4194 log.debug("Check if not empty") 4195 sql_query_chromosomes = 
( 4196 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4197 ) 4198 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4199 log.info(f"VCF empty") 4200 return False 4201 4202 # VCF header 4203 vcf_reader = self.get_header() 4204 log.debug("Initial header: " + str(vcf_reader.infos)) 4205 4206 # Samples 4207 samples = self.get_header_sample_list() 4208 if not samples: 4209 log.error("No Samples in VCF") 4210 return False 4211 log.debug(f"Samples: {samples}") 4212 4213 # Memory limit 4214 memory_limit = self.get_memory("8G") 4215 log.debug(f"memory_limit: {memory_limit}") 4216 4217 # Exomiser java options 4218 exomiser_java_options = ( 4219 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4220 ) 4221 log.debug(f"Exomiser java options: {exomiser_java_options}") 4222 4223 # Download Exomiser (if not exists) 4224 exomiser_release = param_exomiser.get("release", None) 4225 exomiser_application_properties = param_exomiser.get( 4226 "exomiser_application_properties", None 4227 ) 4228 databases_download_exomiser( 4229 assemblies=[assembly], 4230 exomiser_folder=databases_folders, 4231 exomiser_release=exomiser_release, 4232 exomiser_phenotype_release=exomiser_release, 4233 exomiser_application_properties=exomiser_application_properties, 4234 ) 4235 4236 # Force annotation 4237 force_update_annotation = True 4238 4239 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4240 log.debug("Start annotation Exomiser") 4241 4242 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4243 4244 # tmp_dir = "/tmp/exomiser" 4245 4246 ### ANALYSIS ### 4247 ################ 4248 4249 # Create analysis.json through analysis dict 4250 # either analysis in param or by default 4251 # depending on preset exome/genome) 4252 4253 # Init analysis dict 4254 param_exomiser_analysis_dict = {} 4255 4256 # analysis from param 4257 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4258 
param_exomiser_analysis = full_path(param_exomiser_analysis) 4259 4260 # If analysis in param -> load anlaysis json 4261 if param_exomiser_analysis: 4262 4263 # If param analysis is a file and exists 4264 if isinstance(param_exomiser_analysis, str) and os.path.exists( 4265 param_exomiser_analysis 4266 ): 4267 # Load analysis file into analysis dict (either yaml or json) 4268 with open(param_exomiser_analysis) as json_file: 4269 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4270 4271 # If param analysis is a dict 4272 elif isinstance(param_exomiser_analysis, dict): 4273 # Load analysis dict into analysis dict (either yaml or json) 4274 param_exomiser_analysis_dict = param_exomiser_analysis 4275 4276 # Error analysis type 4277 else: 4278 log.error(f"Analysis type unknown. Check param file.") 4279 raise ValueError(f"Analysis type unknown. Check param file.") 4280 4281 # Case no input analysis config file/dict 4282 # Use preset (exome/genome) to open default config file 4283 if not param_exomiser_analysis_dict: 4284 4285 # default preset 4286 default_preset = "exome" 4287 4288 # Get param preset or default preset 4289 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4290 4291 # Try to find if preset is a file 4292 if os.path.exists(param_exomiser_preset): 4293 # Preset file is provided in full path 4294 param_exomiser_analysis_default_config_file = ( 4295 param_exomiser_preset 4296 ) 4297 # elif os.path.exists(full_path(param_exomiser_preset)): 4298 # # Preset file is provided in full path 4299 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4300 elif os.path.exists( 4301 os.path.join(folder_config, param_exomiser_preset) 4302 ): 4303 # Preset file is provided a basename in config folder (can be a path with subfolders) 4304 param_exomiser_analysis_default_config_file = os.path.join( 4305 folder_config, param_exomiser_preset 4306 ) 4307 else: 4308 # Construct preset file 4309 
param_exomiser_analysis_default_config_file = os.path.join( 4310 folder_config, 4311 f"preset-{param_exomiser_preset}-analysis.json", 4312 ) 4313 4314 # If preset file exists 4315 param_exomiser_analysis_default_config_file = full_path( 4316 param_exomiser_analysis_default_config_file 4317 ) 4318 if os.path.exists(param_exomiser_analysis_default_config_file): 4319 # Load prest file into analysis dict (either yaml or json) 4320 with open( 4321 param_exomiser_analysis_default_config_file 4322 ) as json_file: 4323 # param_exomiser_analysis_dict[""] = json.load(json_file) 4324 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4325 json_file 4326 ) 4327 4328 # Error preset file 4329 else: 4330 log.error( 4331 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4332 ) 4333 raise ValueError( 4334 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4335 ) 4336 4337 # If no analysis dict created 4338 if not param_exomiser_analysis_dict: 4339 log.error(f"No analysis config") 4340 raise ValueError(f"No analysis config") 4341 4342 # Log 4343 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4344 4345 ### PHENOPACKET ### 4346 ################### 4347 4348 # If no PhenoPacket in analysis dict -> check in param 4349 if "phenopacket" not in param_exomiser_analysis_dict: 4350 4351 # If PhenoPacket in param -> load anlaysis json 4352 if param_exomiser.get("phenopacket", None): 4353 4354 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4355 param_exomiser_phenopacket = full_path( 4356 param_exomiser_phenopacket 4357 ) 4358 4359 # If param phenopacket is a file and exists 4360 if isinstance( 4361 param_exomiser_phenopacket, str 4362 ) and os.path.exists(param_exomiser_phenopacket): 4363 # Load phenopacket file into analysis dict (either yaml or json) 4364 with open(param_exomiser_phenopacket) as json_file: 4365 param_exomiser_analysis_dict["phenopacket"] = ( 4366 yaml.safe_load(json_file) 
4367 ) 4368 4369 # If param phenopacket is a dict 4370 elif isinstance(param_exomiser_phenopacket, dict): 4371 # Load phenopacket dict into analysis dict (either yaml or json) 4372 param_exomiser_analysis_dict["phenopacket"] = ( 4373 param_exomiser_phenopacket 4374 ) 4375 4376 # Error phenopacket type 4377 else: 4378 log.error(f"Phenopacket type unknown. Check param file.") 4379 raise ValueError( 4380 f"Phenopacket type unknown. Check param file." 4381 ) 4382 4383 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4384 if "phenopacket" not in param_exomiser_analysis_dict: 4385 4386 # Init PhenoPacket 4387 param_exomiser_analysis_dict["phenopacket"] = { 4388 "id": "analysis", 4389 "proband": {}, 4390 } 4391 4392 ### Add subject ### 4393 4394 # If subject exists 4395 param_exomiser_subject = param_exomiser.get("subject", {}) 4396 4397 # If subject not exists -> found sample ID 4398 if not param_exomiser_subject: 4399 4400 # Found sample ID in param 4401 sample = param_exomiser.get("sample", None) 4402 4403 # Find sample ID (first sample) 4404 if not sample: 4405 sample_list = self.get_header_sample_list() 4406 if len(sample_list) > 0: 4407 sample = sample_list[0] 4408 else: 4409 log.error(f"No sample found") 4410 raise ValueError(f"No sample found") 4411 4412 # Create subject 4413 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4414 4415 # Add to dict 4416 param_exomiser_analysis_dict["phenopacket"][ 4417 "subject" 4418 ] = param_exomiser_subject 4419 4420 ### Add "phenotypicFeatures" ### 4421 4422 # If phenotypicFeatures exists 4423 param_exomiser_phenotypicfeatures = param_exomiser.get( 4424 "phenotypicFeatures", [] 4425 ) 4426 4427 # If phenotypicFeatures not exists -> Try to infer from hpo list 4428 if not param_exomiser_phenotypicfeatures: 4429 4430 # Found HPO in param 4431 param_exomiser_hpo = param_exomiser.get("hpo", []) 4432 4433 # Split HPO if list in string format separated by comma 4434 if 
isinstance(param_exomiser_hpo, str): 4435 param_exomiser_hpo = param_exomiser_hpo.split(",") 4436 4437 # Create HPO list 4438 for hpo in param_exomiser_hpo: 4439 hpo_clean = re.sub("[^0-9]", "", hpo) 4440 param_exomiser_phenotypicfeatures.append( 4441 { 4442 "type": { 4443 "id": f"HP:{hpo_clean}", 4444 "label": f"HP:{hpo_clean}", 4445 } 4446 } 4447 ) 4448 4449 # Add to dict 4450 param_exomiser_analysis_dict["phenopacket"][ 4451 "phenotypicFeatures" 4452 ] = param_exomiser_phenotypicfeatures 4453 4454 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4455 if not param_exomiser_phenotypicfeatures: 4456 for step in param_exomiser_analysis_dict.get( 4457 "analysis", {} 4458 ).get("steps", []): 4459 if "hiPhivePrioritiser" in step: 4460 param_exomiser_analysis_dict.get("analysis", {}).get( 4461 "steps", [] 4462 ).remove(step) 4463 4464 ### Add Input File ### 4465 4466 # Initial file name and htsFiles 4467 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4468 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4469 { 4470 "uri": tmp_vcf_name, 4471 "htsFormat": "VCF", 4472 "genomeAssembly": assembly, 4473 } 4474 ] 4475 4476 ### Add metaData ### 4477 4478 # If metaData not in analysis dict 4479 if "metaData" not in param_exomiser_analysis_dict: 4480 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4481 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4482 "createdBy": "howard", 4483 "phenopacketSchemaVersion": 1, 4484 } 4485 4486 ### OutputOptions ### 4487 4488 # Init output result folder 4489 output_results = os.path.join(tmp_dir, "results") 4490 4491 # If no outputOptions in analysis dict 4492 if "outputOptions" not in param_exomiser_analysis_dict: 4493 4494 # default output formats 4495 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4496 4497 # Get outputOptions in param 4498 output_options = param_exomiser.get("outputOptions", None) 4499 4500 # If no output_options in param -> check 4501 if not output_options: 
4502 output_options = { 4503 "outputContributingVariantsOnly": False, 4504 "numGenes": 0, 4505 "outputFormats": defaut_output_formats, 4506 } 4507 4508 # Replace outputDirectory in output options 4509 output_options["outputDirectory"] = output_results 4510 output_options["outputFileName"] = "howard" 4511 4512 # Add outputOptions in analysis dict 4513 param_exomiser_analysis_dict["outputOptions"] = output_options 4514 4515 else: 4516 4517 # Replace output_results and output format (if exists in param) 4518 param_exomiser_analysis_dict["outputOptions"][ 4519 "outputDirectory" 4520 ] = output_results 4521 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4522 list( 4523 set( 4524 param_exomiser_analysis_dict.get( 4525 "outputOptions", {} 4526 ).get("outputFormats", []) 4527 + ["TSV_VARIANT", "VCF"] 4528 ) 4529 ) 4530 ) 4531 4532 # log 4533 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4534 4535 ### ANALYSIS FILE ### 4536 ##################### 4537 4538 ### Full JSON analysis config file ### 4539 4540 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4541 with open(exomiser_analysis, "w") as fp: 4542 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4543 4544 ### SPLIT analysis and sample config files 4545 4546 # Splitted analysis dict 4547 param_exomiser_analysis_dict_for_split = ( 4548 param_exomiser_analysis_dict.copy() 4549 ) 4550 4551 # Phenopacket JSON file 4552 exomiser_analysis_phenopacket = os.path.join( 4553 tmp_dir, "analysis_phenopacket.json" 4554 ) 4555 with open(exomiser_analysis_phenopacket, "w") as fp: 4556 json.dump( 4557 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4558 fp, 4559 indent=4, 4560 ) 4561 4562 # Analysis JSON file without Phenopacket parameters 4563 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4564 exomiser_analysis_analysis = os.path.join( 4565 tmp_dir, "analysis_analysis.json" 4566 ) 4567 with open(exomiser_analysis_analysis, "w") as fp: 4568 
json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4569 4570 ### INITAL VCF file ### 4571 ####################### 4572 4573 ### Create list of samples to use and include inti initial VCF file #### 4574 4575 # Subject (main sample) 4576 # Get sample ID in analysis dict 4577 sample_subject = ( 4578 param_exomiser_analysis_dict.get("phenopacket", {}) 4579 .get("subject", {}) 4580 .get("id", None) 4581 ) 4582 sample_proband = ( 4583 param_exomiser_analysis_dict.get("phenopacket", {}) 4584 .get("proband", {}) 4585 .get("subject", {}) 4586 .get("id", None) 4587 ) 4588 sample = [] 4589 if sample_subject: 4590 sample.append(sample_subject) 4591 if sample_proband: 4592 sample.append(sample_proband) 4593 4594 # Get sample ID within Pedigree 4595 pedigree_persons_list = ( 4596 param_exomiser_analysis_dict.get("phenopacket", {}) 4597 .get("pedigree", {}) 4598 .get("persons", {}) 4599 ) 4600 4601 # Create list with all sample ID in pedigree (if exists) 4602 pedigree_persons = [] 4603 for person in pedigree_persons_list: 4604 pedigree_persons.append(person.get("individualId")) 4605 4606 # Concat subject sample ID and samples ID in pedigreesamples 4607 samples = list(set(sample + pedigree_persons)) 4608 4609 # Check if sample list is not empty 4610 if not samples: 4611 log.error(f"No samples found") 4612 raise ValueError(f"No samples found") 4613 4614 # Create VCF with sample (either sample in param or first one by default) 4615 # Export VCF file 4616 self.export_variant_vcf( 4617 vcf_file=tmp_vcf_name, 4618 remove_info=True, 4619 add_samples=True, 4620 list_samples=samples, 4621 index=False, 4622 ) 4623 4624 ### Execute Exomiser ### 4625 ######################## 4626 4627 # Init command 4628 exomiser_command = "" 4629 4630 # Command exomiser options 4631 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 4632 4633 # Release 4634 exomiser_release = 
param_exomiser.get("release", None) 4635 if exomiser_release: 4636 # phenotype data version 4637 exomiser_options += ( 4638 f" --exomiser.phenotype.data-version={exomiser_release} " 4639 ) 4640 # data version 4641 exomiser_options += ( 4642 f" --exomiser.{assembly}.data-version={exomiser_release} " 4643 ) 4644 # variant white list 4645 variant_white_list_file = ( 4646 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 4647 ) 4648 if os.path.exists( 4649 os.path.join( 4650 databases_folders, assembly, variant_white_list_file 4651 ) 4652 ): 4653 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 4654 4655 # transcript_source 4656 transcript_source = param_exomiser.get( 4657 "transcript_source", None 4658 ) # ucsc, refseq, ensembl 4659 if transcript_source: 4660 exomiser_options += ( 4661 f" --exomiser.{assembly}.transcript-source={transcript_source} " 4662 ) 4663 4664 # If analysis contain proband param 4665 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 4666 "proband", {} 4667 ): 4668 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 4669 4670 # If no proband (usually uniq sample) 4671 else: 4672 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 4673 4674 # Log 4675 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 4676 4677 # Run command 4678 result = subprocess.call( 4679 exomiser_command_analysis.split(), stdout=subprocess.PIPE 4680 ) 4681 if result: 4682 log.error("Exomiser command failed") 4683 raise ValueError("Exomiser command failed") 4684 4685 ### RESULTS ### 4686 ############### 4687 4688 ### Annotate with TSV fields ### 4689 4690 # Init result tsv file 4691 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 4692 4693 # Init result tsv file 4694 output_results_tsv = os.path.join(output_results, 
"howard.variants.tsv") 4695 4696 # Parse TSV file and explode columns in INFO field 4697 if exomiser_to_info and os.path.exists(output_results_tsv): 4698 4699 # Log 4700 log.debug("Exomiser columns to VCF INFO field") 4701 4702 # Retrieve columns and types 4703 query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 4704 output_results_tsv_df = self.get_query_to_df(query) 4705 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 4706 4707 # Init concat fields for update 4708 sql_query_update_concat_fields = [] 4709 4710 # Fields to avoid 4711 fields_to_avoid = [ 4712 "CONTIG", 4713 "START", 4714 "END", 4715 "REF", 4716 "ALT", 4717 "QUAL", 4718 "FILTER", 4719 "GENOTYPE", 4720 ] 4721 4722 # List all columns to add into header 4723 for header_column in output_results_tsv_columns: 4724 4725 # If header column is enable 4726 if header_column not in fields_to_avoid: 4727 4728 # Header info type 4729 header_info_type = "String" 4730 header_column_df = output_results_tsv_df[header_column] 4731 header_column_df_dtype = header_column_df.dtype 4732 if header_column_df_dtype == object: 4733 if ( 4734 pd.to_numeric(header_column_df, errors="coerce") 4735 .notnull() 4736 .all() 4737 ): 4738 header_info_type = "Float" 4739 else: 4740 header_info_type = "Integer" 4741 4742 # Header info 4743 characters_to_validate = ["-"] 4744 pattern = "[" + "".join(characters_to_validate) + "]" 4745 header_info_name = re.sub( 4746 pattern, 4747 "_", 4748 f"Exomiser_{header_column}".replace("#", ""), 4749 ) 4750 header_info_number = "." 
4751 header_info_description = ( 4752 f"Exomiser {header_column} annotation" 4753 ) 4754 header_info_source = "Exomiser" 4755 header_info_version = "unknown" 4756 header_info_code = CODE_TYPE_MAP[header_info_type] 4757 vcf_reader.infos[header_info_name] = vcf.parser._Info( 4758 header_info_name, 4759 header_info_number, 4760 header_info_type, 4761 header_info_description, 4762 header_info_source, 4763 header_info_version, 4764 header_info_code, 4765 ) 4766 4767 # Add field to add for update to concat fields 4768 sql_query_update_concat_fields.append( 4769 f""" 4770 CASE 4771 WHEN table_parquet."{header_column}" NOT IN ('','.') 4772 THEN concat( 4773 '{header_info_name}=', 4774 table_parquet."{header_column}", 4775 ';' 4776 ) 4777 4778 ELSE '' 4779 END 4780 """ 4781 ) 4782 4783 # Update query 4784 sql_query_update = f""" 4785 UPDATE {table_variants} as table_variants 4786 SET INFO = concat( 4787 CASE 4788 WHEN INFO NOT IN ('', '.') 4789 THEN INFO 4790 ELSE '' 4791 END, 4792 CASE 4793 WHEN table_variants.INFO NOT IN ('','.') 4794 THEN ';' 4795 ELSE '' 4796 END, 4797 ( 4798 SELECT 4799 concat( 4800 {",".join(sql_query_update_concat_fields)} 4801 ) 4802 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 4803 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 4804 AND table_parquet.\"START\" = table_variants.\"POS\" 4805 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 4806 AND table_parquet.\"REF\" = table_variants.\"REF\" 4807 ) 4808 ) 4809 ; 4810 """ 4811 4812 # Update 4813 self.conn.execute(sql_query_update) 4814 4815 ### Annotate with VCF INFO field ### 4816 4817 # Init result VCF file 4818 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 4819 4820 # If VCF exists 4821 if os.path.exists(output_results_vcf): 4822 4823 # Log 4824 log.debug("Exomiser result VCF update variants") 4825 4826 # Find Exomiser INFO field annotation in header 4827 with 
    def annotation_snpeff(self, threads: int = None) -> None:
        """
        Annotate the variants table with snpEff.

        Workflow: check the variants table is non-empty, export the variants to
        a temporary bgzipped VCF (INFO stripped, no samples), run the snpEff jar
        on it (the snpEff database for the configured assembly is downloaded if
        missing), scan the captured stderr for warnings/errors, merge any new
        INFO header fields from the annotated VCF into the in-memory VCF header,
        and finally update the variants table from the annotated VCF.

        :param threads: number of threads to use; defaults to
            ``self.get_threads()`` when falsy
        :raises ValueError: if no snpEff binary command can be resolved, or if
            the snpEff run wrote "[E::"-prefixed error lines to its stderr file
        :return: None (the variants table is updated in place)
        """

        # DEBUG
        log.debug("Start annotation with snpeff databases")

        # Threads: fall back to the object's configured thread count
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Keep tmp files/folders when verbosity is "debug"
        # NOTE(review): delete_tmp is computed but never used later in this
        # method — presumably cleanup was intended to honor it; confirm
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases (snpEff annotation databases folders)
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - snpEff bin command (java/jar invocation resolved by
        # get_bin_command; legacy explicit java/snpEff.jar checks removed)
        snpeff_bin_command = get_bin_command(
            bin="snpEff.jar",
            tool="snpeff",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
        )
        if not snpeff_bin_command:
            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - snpEff databases folder (created if missing)
        snpeff_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
        )
        snpeff_databases = full_path(snpeff_databases)
        if snpeff_databases is not None and snpeff_databases != "":
            log.debug(f"Create snpEff databases folder")
            if not os.path.exists(snpeff_databases):
                os.makedirs(snpeff_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - snpEff raw options (logged for debugging; re-read below)
        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
        log.debug("Options: " + str(options))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Param - Options: stats/csvStats paths may contain the "OUTPUT"
        # placeholder, replaced by this object's output file path
        snpeff_options = (
            param.get("annotation", {}).get("snpeff", {}).get("options", "")
        )
        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
        snpeff_csvstats = (
            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
        )
        if snpeff_stats:
            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
            snpeff_stats = full_path(snpeff_stats)
            snpeff_options += f" -stats {snpeff_stats}"
        if snpeff_csvstats:
            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
            snpeff_csvstats = full_path(snpeff_csvstats)
            snpeff_options += f" -csvStats {snpeff_csvstats}"

        # Data
        table_variants = self.get_table_variants()

        # Check the variants table is not empty; nothing to annotate otherwise
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return

        # Temporary bgzipped VCF used as snpEff input
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=True,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header (mutated below when new INFO fields are found)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Log existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Memory limit (default 8G)
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # snpEff java options
        # NOTE(review): snpeff_java_options is built but not injected into
        # snpeff_command below — presumably get_bin_command already carries the
        # java options; confirm. The log message says "Exomiser" but these are
        # snpEff options (copy/paste artifact).
        snpeff_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {snpeff_java_options}")

        # Hard-coded: always (re-)annotate, even if "ANN" already exists
        force_update_annotation = True

        if "ANN" not in self.get_header().infos or force_update_annotation:

            # Ensure the snpEff database for this assembly is present
            # (downloads it if needed)
            log.debug(f"Check snpEff databases {[assembly]}")
            databases_download_snpeff(
                folder=snpeff_databases, assemblies=[assembly], config=config
            )

            # Export VCF file (INFO stripped, no samples, tabix-indexed)
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=True,
                add_samples=False,
                index=True,
            )

            # Tmp output VCF and stderr capture file
            err_files = []
            tmp_annotate_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf",
                delete=False,
            )
            tmp_annotate_vcf_name = tmp_annotate_vcf.name
            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
            err_files.append(tmp_annotate_vcf_name_err)

            # snpEff command: stdout -> annotated VCF, stderr appended to .err
            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
            log.debug(f"Annotation - snpEff command: {snpeff_command}")
            run_parallel_commands([snpeff_command], 1)

            # Scan stderr files: "[W::" lines are warnings, "[E::" are errors
            log.info(f"Error/Warning messages:")
            error_message_command_all = []
            error_message_command_warning = []
            error_message_command_err = []
            for err_file in err_files:
                with open(err_file, "r") as f:
                    for line in f:
                        message = line.strip()
                        error_message_command_all.append(message)
                        if line.startswith("[W::"):
                            error_message_command_warning.append(message)
                        if line.startswith("[E::"):
                            error_message_command_err.append(f"{err_file}: " + message)
            # log info (deduplicated warnings + errors)
            for message in list(
                set(error_message_command_err + error_message_command_warning)
            ):
                log.info(f"   {message}")
            # debug info (all deduplicated messages)
            for message in list(set(error_message_command_all)):
                log.debug(f"   {message}")
            # Any error line aborts the annotation
            if len(error_message_command_err):
                log.error("Annotation failed: Error in commands")
                raise ValueError("Annotation failed: Error in commands")

            # Merge new INFO fields from the annotated VCF header into ours
            with open(tmp_annotate_vcf_name, "rt") as f:
                header_list = self.read_vcf_header(f)
            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

            for ann in annovar_vcf_header.infos:
                if ann not in self.get_header().infos:
                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

            # Update variants table from the annotated VCF
            log.info(f"Annotation - Updating...")
            self.update_from_vcf(tmp_annotate_vcf_name)

        else:
            # Unreachable while force_update_annotation is hard-coded True
            if "ANN" in self.get_header().infos:
                log.debug(f"Existing snpEff annotations in VCF")
            if force_update_annotation:
                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate the variants table with ANNOVAR (table_annovar.pl).

        For each configured ANNOVAR database: export the variants to a
        temporary VCF, run table_annovar.pl, post-process the result through a
        bcftools/sed/awk shell pipeline (ANNOVAR_DATE cleanup, special
        characters, empty-field pruning, field selection and renaming), then
        merge all per-database annotated VCFs with ``bcftools merge``, import
        any new INFO header fields into the in-memory header, and update the
        variants table. Temporary files are removed at the end.

        :param threads: number of threads to use; defaults to
            ``self.get_threads()`` when falsy
        :raises ValueError: if the annovar or bcftools command cannot be
            resolved, or if a command wrote error lines to its stderr file
        :return: None (the variants table is updated in place)
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads: fall back to the object's configured thread count
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files accumulated over the whole run
        tmp_files = []
        err_files = []

        # Keep tmp files/folders when verbosity is "debug"
        # NOTE(review): delete_tmp is computed but the cleanup block at the end
        # is guarded by a hard-coded `if True:` — presumably it should honor
        # delete_tmp; confirm
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases (ANNOVAR annotation databases folders)
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (perl script resolved by get_bin_command)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        annovar_databases = full_path(annovar_databases)
        if annovar_databases != "" and not os.path.exists(annovar_databases):
            os.makedirs(annovar_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (extra table_annovar.pl options, key/value)
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations ({database: {field: new_name_or_None}})
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check the variants table is not empty; nothing to annotate otherwise
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header (mutated below when new INFO fields are found)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Log existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Hard-coded: always (re-)annotate, even if fields already exist
        force_update_annotation = True

        if annotations:

            # NOTE(review): `commands` is never used below; confirm dead
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF: temporary bgzipped VCF used as ANNOVAR input
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file (INFO reduced to ".", no samples, tabix-indexed)
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create the "old new" mapping file consumed by
            # `bcftools annotate --rename-annots`
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Ensure the ANNOVAR databases are present (downloads if needed)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One table_annovar.pl run per configured database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # No fields configured -> keep the whole INFO content
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp folder for this ANNOVAR run and its stderr capture file
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                # table_annovar.pl writes <prefix>.<assembly>_multianno.vcf
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file: final per-database VCF annotated by ANNOVAR
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Fields selected for this database (original and renamed)
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # Field new name, if configured.
                    # TODO: renaming is not fully managed yet (currently
                    # handled only through the bcftools rename file below)
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Append "INFO/<old> <new>" to the bcftools rename file
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                log.debug("annotation_list: " + str(annotation_list))

                # protocol: ANNOVAR database name
                protocol = annotation

                # argument passed to table_annovar.pl --argument (optional)
                argument = ""

                # operation: "f" filter-based by default, "g" gene-based for
                # refGene/ensGene databases, "r" region-based for cytoBand
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options (extra --key=value options from param)
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command - Annovar: run table_annovar.pl then move its
                # multianno output to a predictable .tmp.vcf name
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe (stream the ANNOVAR output VCF)
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation): decode
                # "\x3b" escapes into commas
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (drop INFO entries whose value is ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, remove ANNOVAR
                # bookkeeping fields, rename, compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    for ann in annotation_list:
                        # "^INFO/x" means keep x, drop the others
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Scan stderr files: warning and error prefixes from both
                # bcftools ("[W::"/"[E::") and ANNOVAR ("WARNING"/"ERROR")
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info (deduplicated warnings + errors)
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info (all deduplicated messages)
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # Any error line aborts the annotation
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp merged VCF and its stderr capture file
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Merge the initial VCF with every per-database annotated VCF
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Merge new INFO fields from the merged VCF header into ours
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants table from the merged annotated VCF
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # NOTE(review): guard is hard-coded True; presumably should honor the
        # delete_tmp flag computed above — confirm
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)
False) 5517 ) 5518 log.debug(f"force_update_annotation={force_update_annotation}") 5519 force_append_annotation = ( 5520 self.get_param() 5521 .get("annotation", {}) 5522 .get("options", {}) 5523 .get("annotations_append", False) 5524 ) 5525 log.debug(f"force_append_annotation={force_append_annotation}") 5526 5527 # Data 5528 table_variants = self.get_table_variants() 5529 5530 # Check if not empty 5531 log.debug("Check if not empty") 5532 sql_query_chromosomes_df = self.get_query_to_df( 5533 f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1""" 5534 ) 5535 if not sql_query_chromosomes_df["count"][0]: 5536 log.info(f"VCF empty") 5537 return 5538 5539 # VCF header 5540 vcf_reader = self.get_header() 5541 log.debug("Initial header: " + str(vcf_reader.infos)) 5542 5543 # Nb Variants POS 5544 log.debug("NB Variants Start") 5545 nb_variants = self.conn.execute( 5546 f"SELECT count(*) AS count FROM variants" 5547 ).fetchdf()["count"][0] 5548 log.debug("NB Variants Stop") 5549 5550 # Existing annotations 5551 for vcf_annotation in self.get_header().infos: 5552 5553 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5554 log.debug( 5555 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5556 ) 5557 5558 # Added columns 5559 added_columns = [] 5560 5561 # drop indexes 5562 log.debug(f"Drop indexes...") 5563 self.drop_indexes() 5564 5565 if annotations: 5566 5567 if "ALL" in annotations: 5568 5569 all_param = annotations.get("ALL", {}) 5570 all_param_formats = all_param.get("formats", None) 5571 all_param_releases = all_param.get("releases", None) 5572 5573 databases_infos_dict = self.scan_databases( 5574 database_formats=all_param_formats, 5575 database_releases=all_param_releases, 5576 ) 5577 for database_infos in databases_infos_dict.keys(): 5578 if database_infos not in annotations: 5579 annotations[database_infos] = {"INFO": None} 5580 5581 for annotation in annotations: 5582 5583 if annotation in ["ALL"]: 
5584 continue 5585 5586 # Annotation Name 5587 annotation_name = os.path.basename(annotation) 5588 5589 # Annotation fields 5590 annotation_fields = annotations[annotation] 5591 if not annotation_fields: 5592 annotation_fields = {"INFO": None} 5593 5594 log.debug(f"Annotation '{annotation_name}'") 5595 log.debug( 5596 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 5597 ) 5598 5599 # Create Database 5600 database = Database( 5601 database=annotation, 5602 databases_folders=databases_folders, 5603 assembly=assembly, 5604 ) 5605 5606 # Find files 5607 parquet_file = database.get_database() 5608 parquet_hdr_file = database.get_header_file() 5609 parquet_type = database.get_type() 5610 5611 # Check if files exists 5612 if not parquet_file or not parquet_hdr_file: 5613 log.error("Annotation failed: file not found") 5614 raise ValueError("Annotation failed: file not found") 5615 else: 5616 # Get parquet connexion 5617 parquet_sql_attach = database.get_sql_database_attach( 5618 output="query" 5619 ) 5620 if parquet_sql_attach: 5621 self.conn.execute(parquet_sql_attach) 5622 parquet_file_link = database.get_sql_database_link() 5623 # Log 5624 log.debug( 5625 f"Annotation '{annotation_name}' - file: " 5626 + str(parquet_file) 5627 + " and " 5628 + str(parquet_hdr_file) 5629 ) 5630 5631 # Database full header columns 5632 parquet_hdr_vcf_header_columns = database.get_header_file_columns( 5633 parquet_hdr_file 5634 ) 5635 # Log 5636 log.debug( 5637 "Annotation database header columns : " 5638 + str(parquet_hdr_vcf_header_columns) 5639 ) 5640 5641 # Load header as VCF object 5642 parquet_hdr_vcf_header_infos = database.get_header().infos 5643 # Log 5644 log.debug( 5645 "Annotation database header: " 5646 + str(parquet_hdr_vcf_header_infos) 5647 ) 5648 5649 # Get extra infos 5650 parquet_columns = database.get_extra_columns() 5651 # Log 5652 log.debug("Annotation database Columns: " + str(parquet_columns)) 5653 5654 # Add extra columns if "ALL" in 
annotation_fields 5655 # if "ALL" in annotation_fields: 5656 # allow_add_extra_column = True 5657 if "ALL" in annotation_fields and database.get_extra_columns(): 5658 for extra_column in database.get_extra_columns(): 5659 if ( 5660 extra_column not in annotation_fields 5661 and extra_column.replace("INFO/", "") 5662 not in parquet_hdr_vcf_header_infos 5663 ): 5664 parquet_hdr_vcf_header_infos[extra_column] = ( 5665 vcf.parser._Info( 5666 extra_column, 5667 ".", 5668 "String", 5669 f"{extra_column} description", 5670 "unknown", 5671 "unknown", 5672 self.code_type_map["String"], 5673 ) 5674 ) 5675 5676 # For all fields in database 5677 annotation_fields_all = False 5678 if "ALL" in annotation_fields or "INFO" in annotation_fields: 5679 annotation_fields_all = True 5680 annotation_fields = { 5681 key: key for key in parquet_hdr_vcf_header_infos 5682 } 5683 5684 log.debug( 5685 "Annotation database header - All annotations added: " 5686 + str(annotation_fields) 5687 ) 5688 5689 # Init 5690 5691 # List of annotation fields to use 5692 sql_query_annotation_update_info_sets = [] 5693 5694 # List of annotation to agregate 5695 sql_query_annotation_to_agregate = [] 5696 5697 # Number of fields 5698 nb_annotation_field = 0 5699 5700 # Annotation fields processed 5701 annotation_fields_processed = [] 5702 5703 # Columns mapping 5704 map_columns = database.map_columns( 5705 columns=annotation_fields, prefixes=["INFO/"] 5706 ) 5707 5708 # Query dict for fields to remove (update option) 5709 query_dict_remove = {} 5710 5711 # Fetch Anotation fields 5712 for annotation_field in annotation_fields: 5713 5714 # annotation_field_column 5715 annotation_field_column = map_columns.get( 5716 annotation_field, "INFO" 5717 ) 5718 5719 # field new name, if parametered 5720 annotation_fields_new_name = annotation_fields.get( 5721 annotation_field, annotation_field 5722 ) 5723 if not annotation_fields_new_name: 5724 annotation_fields_new_name = annotation_field 5725 5726 # To annotate 5727 # 
force_update_annotation = True 5728 # force_append_annotation = True 5729 # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)): 5730 if annotation_field in parquet_hdr_vcf_header_infos and ( 5731 force_update_annotation 5732 or force_append_annotation 5733 or ( 5734 annotation_fields_new_name 5735 not in self.get_header().infos 5736 ) 5737 ): 5738 5739 # Add field to annotation to process list 5740 annotation_fields_processed.append( 5741 annotation_fields_new_name 5742 ) 5743 5744 # explode infos for the field 5745 annotation_fields_new_name_info_msg = "" 5746 if ( 5747 force_update_annotation 5748 and annotation_fields_new_name 5749 in self.get_header().infos 5750 ): 5751 # Remove field from INFO 5752 query = f""" 5753 UPDATE {table_variants} as table_variants 5754 SET INFO = REGEXP_REPLACE( 5755 concat(table_variants.INFO,''), 5756 ';*{annotation_fields_new_name}=[^;]*', 5757 '' 5758 ) 5759 WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%' 5760 """ 5761 annotation_fields_new_name_info_msg = " [update]" 5762 query_dict_remove[ 5763 f"remove 'INFO/{annotation_fields_new_name}'" 5764 ] = query 5765 5766 # Sep between fields in INFO 5767 nb_annotation_field += 1 5768 if nb_annotation_field > 1: 5769 annotation_field_sep = ";" 5770 else: 5771 annotation_field_sep = "" 5772 5773 log.info( 5774 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}" 5775 ) 5776 5777 # Add INFO field to header 5778 parquet_hdr_vcf_header_infos_number = ( 5779 parquet_hdr_vcf_header_infos[annotation_field].num 5780 or "." 
5781 ) 5782 parquet_hdr_vcf_header_infos_type = ( 5783 parquet_hdr_vcf_header_infos[annotation_field].type 5784 or "String" 5785 ) 5786 parquet_hdr_vcf_header_infos_description = ( 5787 parquet_hdr_vcf_header_infos[annotation_field].desc 5788 or f"{annotation_field} description" 5789 ) 5790 parquet_hdr_vcf_header_infos_source = ( 5791 parquet_hdr_vcf_header_infos[annotation_field].source 5792 or "unknown" 5793 ) 5794 parquet_hdr_vcf_header_infos_version = ( 5795 parquet_hdr_vcf_header_infos[annotation_field].version 5796 or "unknown" 5797 ) 5798 5799 vcf_reader.infos[annotation_fields_new_name] = ( 5800 vcf.parser._Info( 5801 annotation_fields_new_name, 5802 parquet_hdr_vcf_header_infos_number, 5803 parquet_hdr_vcf_header_infos_type, 5804 parquet_hdr_vcf_header_infos_description, 5805 parquet_hdr_vcf_header_infos_source, 5806 parquet_hdr_vcf_header_infos_version, 5807 self.code_type_map[ 5808 parquet_hdr_vcf_header_infos_type 5809 ], 5810 ) 5811 ) 5812 5813 # Append 5814 if force_append_annotation: 5815 query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """ 5816 else: 5817 query_case_when_append = "" 5818 5819 # Annotation/Update query fields 5820 # Found in INFO column 5821 if ( 5822 annotation_field_column == "INFO" 5823 and "INFO" in parquet_hdr_vcf_header_columns 5824 ): 5825 sql_query_annotation_update_info_sets.append( 5826 f""" 5827 CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append} 5828 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1)) 5829 ELSE '' 5830 END 5831 """ 5832 ) 5833 # Found in a specific column 5834 else: 5835 sql_query_annotation_update_info_sets.append( 5836 f""" 5837 CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append} 
5838 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ',')) 5839 ELSE '' 5840 END 5841 """ 5842 ) 5843 sql_query_annotation_to_agregate.append( 5844 f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """ 5845 ) 5846 5847 # Not to annotate 5848 else: 5849 5850 if force_update_annotation: 5851 annotation_message = "forced" 5852 else: 5853 annotation_message = "skipped" 5854 5855 if annotation_field not in parquet_hdr_vcf_header_infos: 5856 log.warning( 5857 f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file" 5858 ) 5859 if annotation_fields_new_name in self.get_header().infos: 5860 log.warning( 5861 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})" 5862 ) 5863 5864 # Check if ALL fields have to be annotated. 
Thus concat all INFO field 5865 # allow_annotation_full_info = True 5866 allow_annotation_full_info = not force_append_annotation 5867 5868 if parquet_type in ["regions"]: 5869 allow_annotation_full_info = False 5870 5871 if ( 5872 allow_annotation_full_info 5873 and nb_annotation_field == len(annotation_fields) 5874 and annotation_fields_all 5875 and ( 5876 "INFO" in parquet_hdr_vcf_header_columns 5877 and "INFO" in database.get_extra_columns() 5878 ) 5879 ): 5880 log.debug("Column INFO annotation enabled") 5881 sql_query_annotation_update_info_sets = [] 5882 sql_query_annotation_update_info_sets.append( 5883 f" table_parquet.INFO " 5884 ) 5885 5886 if sql_query_annotation_update_info_sets: 5887 5888 # Annotate 5889 log.info(f"Annotation '{annotation_name}' - Annotation...") 5890 5891 # Join query annotation update info sets for SQL 5892 sql_query_annotation_update_info_sets_sql = ",".join( 5893 sql_query_annotation_update_info_sets 5894 ) 5895 5896 # Check chromosomes list (and variants infos) 5897 sql_query_chromosomes = f""" 5898 SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants 5899 FROM {table_variants} as table_variants 5900 GROUP BY table_variants."#CHROM" 5901 ORDER BY table_variants."#CHROM" 5902 """ 5903 sql_query_chromosomes_df = self.conn.execute( 5904 sql_query_chromosomes 5905 ).df() 5906 sql_query_chromosomes_dict = { 5907 entry["CHROM"]: { 5908 "count": entry["count_variants"], 5909 "min": entry["min_variants"], 5910 "max": entry["max_variants"], 5911 } 5912 for index, entry in sql_query_chromosomes_df.iterrows() 5913 } 5914 5915 # Init 5916 nb_of_query = 0 5917 nb_of_variant_annotated = 0 5918 query_dict = query_dict_remove 5919 5920 # for chrom in sql_query_chromosomes_df["CHROM"]: 5921 for chrom in sql_query_chromosomes_dict: 5922 5923 # Number of variant by chromosome 5924 nb_of_variant_by_chrom = sql_query_chromosomes_dict.get( 5925 chrom, {} 5926 ).get("count", 0) 5927 5928 
log.debug( 5929 f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..." 5930 ) 5931 5932 # Annotation with regions database 5933 if parquet_type in ["regions"]: 5934 sql_query_annotation_from_clause = f""" 5935 FROM ( 5936 SELECT 5937 '{chrom}' AS \"#CHROM\", 5938 table_variants_from.\"POS\" AS \"POS\", 5939 {",".join(sql_query_annotation_to_agregate)} 5940 FROM {table_variants} as table_variants_from 5941 LEFT JOIN {parquet_file_link} as table_parquet_from ON ( 5942 table_parquet_from."#CHROM" = '{chrom}' 5943 AND table_variants_from.\"POS\" <= table_parquet_from.\"END\" 5944 AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1) 5945 OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1) 5946 ) 5947 ) 5948 WHERE table_variants_from.\"#CHROM\" in ('{chrom}') 5949 GROUP BY table_variants_from.\"POS\" 5950 ) 5951 as table_parquet 5952 """ 5953 5954 sql_query_annotation_where_clause = """ 5955 table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 5956 AND table_parquet.\"POS\" = table_variants.\"POS\" 5957 """ 5958 5959 # Annotation with variants database 5960 else: 5961 sql_query_annotation_from_clause = f""" 5962 FROM {parquet_file_link} as table_parquet 5963 """ 5964 sql_query_annotation_where_clause = f""" 5965 table_variants."#CHROM" = '{chrom}' 5966 AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 5967 AND table_parquet.\"POS\" = table_variants.\"POS\" 5968 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 5969 AND table_parquet.\"REF\" = table_variants.\"REF\" 5970 """ 5971 5972 # Create update query 5973 sql_query_annotation_chrom_interval_pos = f""" 5974 UPDATE {table_variants} as table_variants 5975 SET INFO = 5976 concat( 5977 CASE WHEN table_variants.INFO NOT IN ('','.') 5978 THEN table_variants.INFO 5979 ELSE '' 5980 END 5981 , 5982 CASE WHEN table_variants.INFO NOT IN ('','.') 5983 AND ( 5984 
concat({sql_query_annotation_update_info_sets_sql}) 5985 ) 5986 NOT IN ('','.') 5987 THEN ';' 5988 ELSE '' 5989 END 5990 , 5991 {sql_query_annotation_update_info_sets_sql} 5992 ) 5993 {sql_query_annotation_from_clause} 5994 WHERE {sql_query_annotation_where_clause} 5995 ; 5996 """ 5997 5998 # Add update query to dict 5999 query_dict[ 6000 f"{chrom} [{nb_of_variant_by_chrom} variants]" 6001 ] = sql_query_annotation_chrom_interval_pos 6002 6003 nb_of_query = len(query_dict) 6004 num_query = 0 6005 6006 # SET max_expression_depth TO x 6007 self.conn.execute("SET max_expression_depth TO 10000") 6008 6009 for query_name in query_dict: 6010 query = query_dict[query_name] 6011 num_query += 1 6012 log.info( 6013 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..." 6014 ) 6015 result = self.conn.execute(query) 6016 nb_of_variant_annotated_by_query = result.df()["Count"][0] 6017 nb_of_variant_annotated += nb_of_variant_annotated_by_query 6018 log.info( 6019 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated" 6020 ) 6021 6022 log.info( 6023 f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)" 6024 ) 6025 6026 else: 6027 6028 log.info( 6029 f"Annotation '{annotation_name}' - No Annotations available" 6030 ) 6031 6032 log.debug("Final header: " + str(vcf_reader.infos)) 6033 6034 # Remove added columns 6035 for added_column in added_columns: 6036 self.drop_column(column=added_column) 6037 6038 def annotation_splice(self, threads: int = None) -> None: 6039 """ 6040 This function annotate with snpEff 6041 6042 :param threads: The number of threads to use 6043 :return: the value of the variable "return_value". 
        """

        # DEBUG
        log.debug("Start annotation with splice tools")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is computed and logged but not read again in
        # this method (within view)
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config: splice tool settings, falling back to the default tool bins
        config = self.get_config()
        log.debug("Config: " + str(config))
        splice_config = config.get("tools", {}).get("splice", {})
        if not splice_config:
            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
        if not splice_config:
            msg_err = "No Splice tool config"
            log.error(msg_err)
            raise ValueError(msg_err)
        log.debug(f"splice_config={splice_config}")

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("splice", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Splice docker image
        # NOTE(review): raises AttributeError (not a clean error) if the 'docker'
        # key is missing from splice_config — confirm the config schema guarantees it
        splice_docker_image = splice_config.get("docker").get("image")

        # Pull splice image if it's not already there
        if not check_docker_image_exists(splice_docker_image):
            log.warning(
                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
            )
            try:
                command(f"docker pull {splice_config.get('docker').get('image')}")
            except subprocess.CalledProcessError:
                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
                log.error(msg_err)
                raise ValueError(msg_err)
                # NOTE(review): unreachable — the raise above exits first
                return None

        # Config - splice databases
        # NOTE(review): splice_databases is resolved but not used afterwards in
        # this method (within view)
        splice_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("splice", DEFAULT_SPLICE_FOLDER)
        )
        splice_databases = full_path(splice_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param
        options = param.get("annotation", {}).get("splice", {})
        log.debug("Options: " + str(options))

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info("VCF empty")
            return None

        # Export in VCF
        log.debug("Create initial file to annotate")

        # Create output folder (unique per run)
        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
        if not os.path.exists(output_folder):
            Path(output_folder).mkdir(parents=True, exist_ok=True)

        # Create tmp VCF file (delete=False: the pipeline reads it by path)
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=output_folder,
            suffix=".vcf",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header (extended below with the pipeline's new INFO fields)
        header = self.get_header()

        # Existing annotations
        for vcf_annotation in self.get_header().infos:
            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Memory limit
        # NOTE(review): memory_limit is computed and logged but not used later in
        # this method (within view)
        if config.get("memory", None):
            memory_limit = config.get("memory", "8G").upper()
            # upper()
        else:
            memory_limit = "8G"
        log.debug(f"memory_limit: {memory_limit}")

        # Check number of variants to annotate: skip variants whose INFO already
        # carries SpliceAI_* or SPiP_* annotations
        where_clause_regex_spliceai = r"SpliceAI_\w+"
        where_clause_regex_spip = r"SPiP_\w+"
        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
        df_list_of_variants_to_annotate = self.get_query_to_df(
            query=f""" SELECT * FROM variants {where_clause} """
        )
        if len(df_list_of_variants_to_annotate) == 0:
            log.warning(
                f"No variants to annotate with splice. Variants probably already annotated with splice"
            )
            return None
        else:
            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")

        # Export VCF file (only the not-yet-annotated variants)
        self.export_variant_vcf(
            vcf_file=tmp_vcf_name,
            remove_info=True,
            add_samples=True,
            index=False,
            where_clause=where_clause,
        )

        # Create docker container and launch splice analysis
        if splice_config:

            # Splice mount folders
            mount_folders = splice_config.get("mount", {})

            # Genome mount
            mount_folders[
                config.get("folders", {})
                .get("databases", {})
                .get("genomes", DEFAULT_GENOME_FOLDER)
            ] = "ro"

            # SpliceAI mount
            mount_folders[
                config.get("folders", {})
                .get("databases", {})
                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
            ] = "ro"

            # Genome mount
            # NOTE(review): comment says "Genome" but this mounts the SPiP folder
            mount_folders[
                config.get("folders", {})
                .get("databases", {})
                .get("spip", DEFAULT_SPIP_FOLDER)
            ] = "ro"

            # Mount folders
            mount = []

            # Config mount: docker -v options, same path inside and out
            mount = [
                f"-v {full_path(path)}:{full_path(path)}:{mode}"
                for path, mode in mount_folders.items()
            ]

            if any(value for value in splice_config.values() if value is None):
                log.warning("At least one splice config parameter is empty")
                return None

            # Params in splice nf
            def check_values(dico: dict):
                """
                Yield '--key value' Nextflow CLI parameters from a config dict.

                The 'genome' key is normalized to hg19/hg38; other keys are kept
                when the value is a non-empty string, an int or a bool.
                """
                for key, val in dico.items():
                    if key == "genome":
                        if any(
                            assemb in options.get("genome", {})
                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
                        ):
                            yield f"--{key} hg19"
                        elif any(
                            assemb in options.get("genome", {})
                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
                        ):
                            yield f"--{key} hg38"
                    elif (
                        (isinstance(val, str) and val)
                        or isinstance(val, int)
                        or isinstance(val, bool)
                    ):
                        yield f"--{key} {val}"

            # Genome
            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
            options["genome"] = genome

            # NF params
            nf_params = []

            # Add options
            if options:
                nf_params = list(check_values(options))
                log.debug(f"Splice NF params: {' '.join(nf_params)}")
            else:
                log.debug("No NF params provided")

            # Add threads
            if "threads" not in options.keys():
                nf_params.append(f"--threads {threads}")

            # Genome path
            genome_path = find_genome(
                config.get("folders", {})
                .get("databases", {})
                .get("genomes", DEFAULT_GENOME_FOLDER),
                file=f"{genome}.fa",
            )
            # Add genome path
            if not genome_path:
                raise ValueError(
                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
                )
            else:
                log.debug(f"Genome: {genome_path}")
                nf_params.append(f"--genome_path {genome_path}")

            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
                """
                Setting up updated databases for SPiP and SpliceAI.

                Returns the '--spip_transcriptome'/'--spliceai_annotations' CLI
                parameters when both database files are found locally, otherwise
                an empty list (the docker image's bundled annotations are used).
                """
                try:
                    # SpliceAI assembly transcriptome
                    spliceai_assembly = os.path.join(
                        config.get("folders", {})
                        .get("databases", {})
                        .get("spliceai", {}),
                        options.get("genome"),
                        "transcriptome",
                    )
                    spip_assembly = options.get("genome")

                    spip = find(
                        f"transcriptome_{spip_assembly}.RData",
                        config.get("folders", {}).get("databases", {}).get("spip", {}),
                    )
                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
                    log.debug(f"SPiP annotations: {spip}")
                    log.debug(f"SpliceAI annotations: {spliceai}")
                    if spip and spliceai:
                        return [
                            f"--spip_transcriptome {spip}",
                            f"--spliceai_annotations {spliceai}",
                        ]
                    else:
                        # TODO crash and go on with basic annotations ?
                        # raise ValueError(
                        #     "Can't find splice databases in configuration EXIT"
                        # )
                        log.warning(
                            "Can't find splice databases in configuration, use annotations file from image"
                        )
                except TypeError:
                    # os.path.join/find raise TypeError when a folder entry is missing
                    log.warning(
                        "Can't find splice databases in configuration, use annotations file from image"
                    )
                return []

            # Add options, check if transcriptome option have already beend provided
            if (
                "spip_transcriptome" not in nf_params
                and "spliceai_transcriptome" not in nf_params
            ):
                splice_reference = splice_annotations(options, config)
                if splice_reference:
                    nf_params.extend(splice_reference)

            nf_params.append(f"--output_folder {output_folder}")

            # Nextflow command executed inside the splice docker image
            random_uuid = f"HOWARD-SPLICE-{get_random()}"
            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
            log.debug(cmd)

            splice_config["docker"]["command"] = cmd

            docker_cmd = get_bin_command(
                tool="splice",
                bin_type="docker",
                config=config,
                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
                add_options=f"--name {random_uuid} {' '.join(mount)}",
            )

            # Docker debug
            # if splice_config.get("rm_container"):
            #     rm_container = "--rm"
            # else:
            #     rm_container = ""
            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"

            log.debug(docker_cmd)
            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
            log.debug(res.stdout)
            if res.stderr:
                log.error(res.stderr)
                res.check_returncode()
        else:
            log.warning(f"Splice tool configuration not found: {config}")

        # Update variants
        log.info("Annotation - Updating...")
        # Test find output vcf
        log.debug(
            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
        )
        output_vcf = []
        # Wrong folder to look in
        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
            if (
                files
                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
            ):
                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
        # log.debug(os.listdir(options.get("output_folder")))
        # NOTE(review): output_vcf[0] raises IndexError when no output was found,
        # making the 'if not output_vcf' branch below unreachable — the debug log
        # should be moved after (or inside) the emptiness check
        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
        if not output_vcf:
            log.debug(
                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
            )
        else:
            # Get new header from annotated vcf
            log.debug(f"Initial header: {len(header.infos)} fields")
            # Create new header with splice infos
            new_vcf = Variants(input=output_vcf[0])
            new_vcf_header = new_vcf.get_header().infos
            for keys, infos in new_vcf_header.items():
                if keys not in header.infos.keys():
                    header.infos[keys] = infos
            log.debug(f"New header: {len(header.infos)} fields")
            log.debug(f"Splice tmp output: {output_vcf[0]}")
            self.update_from_vcf(output_vcf[0])

        # Remove folder
        remove_if_exists(output_folder)

    ###
    # Prioritization
    ###

    def get_config_default(self, name: str) -> dict:
        """
        The function `get_config_default` returns a dictionary containing default
        configurations for various calculations and prioritizations.

        :param name: The `get_config_default` function returns a dictionary containing default
        configurations for different calculations and prioritizations.
The `name` parameter is used to 6411 specify which specific configuration to retrieve from the dictionary 6412 :type name: str 6413 :return: The function `get_config_default` returns a dictionary containing default configuration 6414 settings for different calculations and prioritizations. The specific configuration settings are 6415 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6416 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6417 returned. If there is no match, an empty dictionary is returned. 6418 """ 6419 6420 config_default = { 6421 "calculations": { 6422 "variant_chr_pos_alt_ref": { 6423 "type": "sql", 6424 "name": "variant_chr_pos_alt_ref", 6425 "description": "Create a variant ID with chromosome, position, alt and ref", 6426 "available": False, 6427 "output_column_name": "variant_chr_pos_alt_ref", 6428 "output_column_type": "String", 6429 "output_column_description": "variant ID with chromosome, position, alt and ref", 6430 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6431 "operation_info": True, 6432 }, 6433 "VARTYPE": { 6434 "type": "sql", 6435 "name": "VARTYPE", 6436 "description": "Variant type (e.g. 
SNV, INDEL, MNV, BND...)", 6437 "available": True, 6438 "output_column_name": "VARTYPE", 6439 "output_column_type": "String", 6440 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6441 "operation_query": """ 6442 CASE 6443 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6444 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6445 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6446 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6447 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6448 ELSE 'UNDEFINED' 6449 END 6450 """, 6451 "info_fields": ["SVTYPE"], 6452 "operation_info": True, 6453 }, 6454 "snpeff_hgvs": { 6455 "type": "python", 6456 "name": "snpeff_hgvs", 6457 "description": "HGVS nomenclatures from snpEff annotation", 6458 "available": True, 6459 "function_name": "calculation_extract_snpeff_hgvs", 6460 "function_params": ["snpeff_hgvs", "ANN"], 6461 }, 6462 "snpeff_ann_explode": { 6463 "type": "python", 6464 "name": "snpeff_ann_explode", 6465 "description": "Explode snpEff annotations with uniquify values", 6466 "available": True, 6467 "function_name": "calculation_snpeff_ann_explode", 6468 "function_params": [False, "fields", "snpeff_", "ANN"], 6469 }, 6470 "snpeff_ann_explode_uniquify": { 6471 "type": "python", 6472 "name": "snpeff_ann_explode_uniquify", 6473 "description": "Explode snpEff annotations", 6474 "available": True, 6475 "function_name": "calculation_snpeff_ann_explode", 6476 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6477 }, 6478 "snpeff_ann_explode_json": { 6479 "type": "python", 6480 "name": "snpeff_ann_explode_json", 6481 "description": "Explode snpEff annotations in JSON format", 6482 "available": True, 6483 "function_name": "calculation_snpeff_ann_explode", 6484 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6485 }, 6486 "NOMEN": { 6487 "type": "python", 6488 "name": "NOMEN", 6489 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) 
from HGVS nomenclature field", 6490 "available": True, 6491 "function_name": "calculation_extract_nomen", 6492 "function_params": [], 6493 }, 6494 "FINDBYPIPELINE": { 6495 "type": "python", 6496 "name": "FINDBYPIPELINE", 6497 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6498 "available": True, 6499 "function_name": "calculation_find_by_pipeline", 6500 "function_params": ["findbypipeline"], 6501 }, 6502 "FINDBYSAMPLE": { 6503 "type": "python", 6504 "name": "FINDBYSAMPLE", 6505 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6506 "available": True, 6507 "function_name": "calculation_find_by_pipeline", 6508 "function_params": ["findbysample"], 6509 }, 6510 "GENOTYPECONCORDANCE": { 6511 "type": "python", 6512 "name": "GENOTYPECONCORDANCE", 6513 "description": "Concordance of genotype for multi caller VCF", 6514 "available": True, 6515 "function_name": "calculation_genotype_concordance", 6516 "function_params": [], 6517 }, 6518 "BARCODE": { 6519 "type": "python", 6520 "name": "BARCODE", 6521 "description": "BARCODE as VaRank tool", 6522 "available": True, 6523 "function_name": "calculation_barcode", 6524 "function_params": [], 6525 }, 6526 "BARCODEFAMILY": { 6527 "type": "python", 6528 "name": "BARCODEFAMILY", 6529 "description": "BARCODEFAMILY as VaRank tool", 6530 "available": True, 6531 "function_name": "calculation_barcode_family", 6532 "function_params": ["BCF"], 6533 }, 6534 "TRIO": { 6535 "type": "python", 6536 "name": "TRIO", 6537 "description": "Inheritance for a trio family", 6538 "available": True, 6539 "function_name": "calculation_trio", 6540 "function_params": [], 6541 }, 6542 "VAF": { 6543 "type": "python", 6544 "name": "VAF", 6545 "description": "Variant Allele Frequency (VAF) harmonization", 6546 "available": True, 6547 "function_name": "calculation_vaf_normalization", 6548 "function_params": [], 6549 }, 6550 "VAF_stats": { 6551 "type": "python", 6552 "name": 
"VAF_stats", 6553 "description": "Variant Allele Frequency (VAF) statistics", 6554 "available": True, 6555 "function_name": "calculation_genotype_stats", 6556 "function_params": ["VAF"], 6557 }, 6558 "DP_stats": { 6559 "type": "python", 6560 "name": "DP_stats", 6561 "description": "Depth (DP) statistics", 6562 "available": True, 6563 "function_name": "calculation_genotype_stats", 6564 "function_params": ["DP"], 6565 }, 6566 "variant_id": { 6567 "type": "python", 6568 "name": "variant_id", 6569 "description": "Variant ID generated from variant position and type", 6570 "available": True, 6571 "function_name": "calculation_variant_id", 6572 "function_params": [], 6573 }, 6574 "transcripts_json": { 6575 "type": "python", 6576 "name": "transcripts_json", 6577 "description": "Add transcripts annotations in JSON format (field 'transcripts_json')", 6578 "available": True, 6579 "function_name": "calculation_transcripts_annotation", 6580 "function_params": ["transcripts_json", None], 6581 }, 6582 "transcripts_ann": { 6583 "type": "python", 6584 "name": "transcripts_ann", 6585 "description": "Add transcripts annotations in structured format (field 'transcripts_ann')", 6586 "available": True, 6587 "function_name": "calculation_transcripts_annotation", 6588 "function_params": [None, "transcripts_ann"], 6589 }, 6590 "transcripts_annotations": { 6591 "type": "python", 6592 "name": "transcripts_annotations", 6593 "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)", 6594 "available": True, 6595 "function_name": "calculation_transcripts_annotation", 6596 "function_params": [None, None], 6597 }, 6598 "transcripts_prioritization": { 6599 "type": "python", 6600 "name": "transcripts_prioritization", 6601 "description": "Prioritize transcripts with a prioritization profile (using param.json)", 6602 "available": True, 6603 "function_name": "calculation_transcripts_prioritization", 6604 "function_params": [], 6605 }, 6606 }, 6607 
"prioritizations": { 6608 "default": { 6609 "ANN2": [ 6610 { 6611 "type": "contains", 6612 "value": "HIGH", 6613 "score": 5, 6614 "flag": "PASS", 6615 "comment": [ 6616 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6617 ], 6618 }, 6619 { 6620 "type": "contains", 6621 "value": "MODERATE", 6622 "score": 3, 6623 "flag": "PASS", 6624 "comment": [ 6625 "A non-disruptive variant that might change protein effectiveness" 6626 ], 6627 }, 6628 { 6629 "type": "contains", 6630 "value": "LOW", 6631 "score": 0, 6632 "flag": "FILTERED", 6633 "comment": [ 6634 "Assumed to be mostly harmless or unlikely to change protein behavior" 6635 ], 6636 }, 6637 { 6638 "type": "contains", 6639 "value": "MODIFIER", 6640 "score": 0, 6641 "flag": "FILTERED", 6642 "comment": [ 6643 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 6644 ], 6645 }, 6646 ], 6647 } 6648 }, 6649 } 6650 6651 return config_default.get(name, None) 6652 6653 def get_config_json( 6654 self, name: str, config_dict: dict = {}, config_file: str = None 6655 ) -> dict: 6656 """ 6657 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 6658 default values, a dictionary, and a file. 6659 6660 :param name: The `name` parameter in the `get_config_json` function is a string that represents 6661 the name of the configuration. It is used to identify and retrieve the configuration settings 6662 for a specific component or module 6663 :type name: str 6664 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 6665 dictionary that allows you to provide additional configuration settings or overrides. 
When you 6666 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 6667 the key is the configuration setting you want to override or 6668 :type config_dict: dict 6669 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 6670 specify the path to a configuration file that contains additional settings. If provided, the 6671 function will read the contents of this file and update the configuration dictionary with the 6672 values found in the file, overriding any existing values with the 6673 :type config_file: str 6674 :return: The function `get_config_json` returns a dictionary containing the configuration 6675 settings. 6676 """ 6677 6678 # Create with default prioritizations 6679 config_default = self.get_config_default(name=name) 6680 configuration = config_default 6681 # log.debug(f"configuration={configuration}") 6682 6683 # Replace prioritizations from dict 6684 for config in config_dict: 6685 configuration[config] = config_dict[config] 6686 6687 # Replace prioritizations from file 6688 config_file = full_path(config_file) 6689 if config_file: 6690 if os.path.exists(config_file): 6691 with open(config_file) as config_file_content: 6692 config_file_dict = json.load(config_file_content) 6693 for config in config_file_dict: 6694 configuration[config] = config_file_dict[config] 6695 else: 6696 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 6697 log.error(msg_error) 6698 raise ValueError(msg_error) 6699 6700 return configuration 6701 6702 def prioritization( 6703 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 6704 ) -> bool: 6705 """ 6706 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 6707 prioritizes variants based on configured profiles and criteria. 
6708 6709 :param table: The `table` parameter in the `prioritization` function is used to specify the name 6710 of the table (presumably a VCF file) on which the prioritization operation will be performed. If 6711 a table name is provided, the method will prioritize the variants in that specific table 6712 :type table: str 6713 :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to 6714 certain INFO fields in a VCF file during the prioritization process. If this parameter is not 6715 provided, the code will use a default prefix value of "PZ" 6716 :type pz_prefix: str 6717 :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass 6718 additional parameters specific to the prioritization process. These parameters can include 6719 settings related to prioritization profiles, fields, scoring modes, flags, comments, and other 6720 configurations needed for the prioritization of variants in a V 6721 :type pz_param: dict 6722 :return: A boolean value (True) is being returned from the `prioritization` function. 
6723 """ 6724 6725 # Config 6726 config = self.get_config() 6727 6728 # Param 6729 param = self.get_param() 6730 6731 # Prioritization param 6732 if pz_param is not None: 6733 prioritization_param = pz_param 6734 else: 6735 prioritization_param = param.get("prioritization", {}) 6736 6737 # Configuration profiles 6738 prioritization_config_file = prioritization_param.get( 6739 "prioritization_config", None 6740 ) 6741 prioritization_config_file = full_path(prioritization_config_file) 6742 prioritizations_config = self.get_config_json( 6743 name="prioritizations", config_file=prioritization_config_file 6744 ) 6745 6746 # Prioritization prefix 6747 pz_prefix_default = "PZ" 6748 if pz_prefix is None: 6749 pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default) 6750 6751 # Prioritization options 6752 profiles = prioritization_param.get("profiles", []) 6753 if isinstance(profiles, str): 6754 profiles = profiles.split(",") 6755 pzfields = prioritization_param.get( 6756 "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"] 6757 ) 6758 if isinstance(pzfields, str): 6759 pzfields = pzfields.split(",") 6760 default_profile = prioritization_param.get("default_profile", None) 6761 pzfields_sep = prioritization_param.get("pzfields_sep", "_") 6762 prioritization_score_mode = prioritization_param.get( 6763 "prioritization_score_mode", "HOWARD" 6764 ) 6765 6766 # Quick Prioritizations 6767 prioritizations = param.get("prioritizations", None) 6768 if prioritizations: 6769 log.info("Quick Prioritization:") 6770 for profile in prioritizations.split(","): 6771 if profile not in profiles: 6772 profiles.append(profile) 6773 log.info(f" {profile}") 6774 6775 # If profile "ALL" provided, all profiles in the config profiles 6776 if "ALL" in profiles: 6777 profiles = list(prioritizations_config.keys()) 6778 6779 for profile in profiles: 6780 if prioritizations_config.get(profile, None): 6781 log.debug(f"Profile '{profile}' configured") 6782 else: 6783 msg_error = f"Profile 
'{profile}' NOT configured" 6784 log.error(msg_error) 6785 raise ValueError(msg_error) 6786 6787 if profiles: 6788 log.info(f"Prioritization... ") 6789 else: 6790 log.debug(f"No profile defined") 6791 return False 6792 6793 if not default_profile and len(profiles): 6794 default_profile = profiles[0] 6795 6796 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 6797 log.debug("Profiles to check: " + str(list(profiles))) 6798 6799 # Variables 6800 if table is not None: 6801 table_variants = table 6802 else: 6803 table_variants = self.get_table_variants(clause="update") 6804 log.debug(f"Table to prioritize: {table_variants}") 6805 6806 # Added columns 6807 added_columns = [] 6808 6809 # Create list of PZfields 6810 # List of PZFields 6811 list_of_pzfields_original = pzfields + [ 6812 pzfield + pzfields_sep + profile 6813 for pzfield in pzfields 6814 for profile in profiles 6815 ] 6816 list_of_pzfields = [] 6817 log.debug(f"{list_of_pzfields_original}") 6818 6819 # Remove existing PZfields to use if exists 6820 for pzfield in list_of_pzfields_original: 6821 if self.get_header().infos.get(pzfield, None) is None: 6822 list_of_pzfields.append(pzfield) 6823 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 6824 else: 6825 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 6826 6827 if list_of_pzfields: 6828 6829 # Explode Infos prefix 6830 explode_infos_prefix = self.get_explode_infos_prefix() 6831 6832 # PZfields tags description 6833 PZfields_INFOS = { 6834 f"{pz_prefix}Tags": { 6835 "ID": f"{pz_prefix}Tags", 6836 "Number": ".", 6837 "Type": "String", 6838 "Description": "Variant tags based on annotation criteria", 6839 }, 6840 f"{pz_prefix}Score": { 6841 "ID": f"{pz_prefix}Score", 6842 "Number": 1, 6843 "Type": "Integer", 6844 "Description": "Variant score based on annotation criteria", 6845 }, 6846 f"{pz_prefix}Flag": { 6847 "ID": f"{pz_prefix}Flag", 6848 "Number": 1, 6849 "Type": "String", 6850 
"Description": "Variant flag based on annotation criteria", 6851 }, 6852 f"{pz_prefix}Comment": { 6853 "ID": f"{pz_prefix}Comment", 6854 "Number": ".", 6855 "Type": "String", 6856 "Description": "Variant comment based on annotation criteria", 6857 }, 6858 f"{pz_prefix}Infos": { 6859 "ID": f"{pz_prefix}Infos", 6860 "Number": ".", 6861 "Type": "String", 6862 "Description": "Variant infos based on annotation criteria", 6863 }, 6864 f"{pz_prefix}Class": { 6865 "ID": f"{pz_prefix}Class", 6866 "Number": ".", 6867 "Type": "String", 6868 "Description": "Variant class based on annotation criteria", 6869 }, 6870 } 6871 6872 # Create INFO fields if not exist 6873 for field in PZfields_INFOS: 6874 field_ID = PZfields_INFOS[field]["ID"] 6875 field_description = PZfields_INFOS[field]["Description"] 6876 if field_ID not in self.get_header().infos and field_ID in pzfields: 6877 field_description = ( 6878 PZfields_INFOS[field]["Description"] 6879 + f", profile {default_profile}" 6880 ) 6881 self.get_header().infos[field_ID] = vcf.parser._Info( 6882 field_ID, 6883 PZfields_INFOS[field]["Number"], 6884 PZfields_INFOS[field]["Type"], 6885 field_description, 6886 "unknown", 6887 "unknown", 6888 code_type_map[PZfields_INFOS[field]["Type"]], 6889 ) 6890 6891 # Create INFO fields if not exist for each profile 6892 for profile in prioritizations_config: 6893 if profile in profiles or profiles == []: 6894 for field in PZfields_INFOS: 6895 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 6896 field_description = ( 6897 PZfields_INFOS[field]["Description"] 6898 + f", profile {profile}" 6899 ) 6900 if ( 6901 field_ID not in self.get_header().infos 6902 and field in pzfields 6903 ): 6904 self.get_header().infos[field_ID] = vcf.parser._Info( 6905 field_ID, 6906 PZfields_INFOS[field]["Number"], 6907 PZfields_INFOS[field]["Type"], 6908 field_description, 6909 "unknown", 6910 "unknown", 6911 code_type_map[PZfields_INFOS[field]["Type"]], 6912 ) 6913 6914 # Header 6915 for pzfield in 
list_of_pzfields: 6916 if re.match(f"{pz_prefix}Score.*", pzfield): 6917 added_column = self.add_column( 6918 table_name=table_variants, 6919 column_name=pzfield, 6920 column_type="INTEGER", 6921 default_value="0", 6922 ) 6923 elif re.match(f"{pz_prefix}Flag.*", pzfield): 6924 added_column = self.add_column( 6925 table_name=table_variants, 6926 column_name=pzfield, 6927 column_type="BOOLEAN", 6928 default_value="1", 6929 ) 6930 elif re.match(f"{pz_prefix}Class.*", pzfield): 6931 added_column = self.add_column( 6932 table_name=table_variants, 6933 column_name=pzfield, 6934 column_type="VARCHAR[]", 6935 default_value="null", 6936 ) 6937 else: 6938 added_column = self.add_column( 6939 table_name=table_variants, 6940 column_name=pzfield, 6941 column_type="STRING", 6942 default_value="''", 6943 ) 6944 added_columns.append(added_column) 6945 6946 # Profiles 6947 if profiles: 6948 6949 # foreach profile in configuration file 6950 for profile in prioritizations_config: 6951 6952 # If profile is asked in param, or ALL are asked (empty profile []) 6953 if profile in profiles or profiles == []: 6954 log.info(f"Profile '{profile}'") 6955 6956 sql_set_info_option = "" 6957 6958 sql_set_info = [] 6959 6960 # PZ fields set 6961 6962 # PZScore 6963 if ( 6964 f"{pz_prefix}Score{pzfields_sep}{profile}" 6965 in list_of_pzfields 6966 ): 6967 sql_set_info.append( 6968 f""" 6969 concat( 6970 '{pz_prefix}Score{pzfields_sep}{profile}=', 6971 {pz_prefix}Score{pzfields_sep}{profile} 6972 ) 6973 """ 6974 ) 6975 if ( 6976 profile == default_profile 6977 and f"{pz_prefix}Score" in list_of_pzfields 6978 ): 6979 sql_set_info.append( 6980 f""" 6981 concat( 6982 '{pz_prefix}Score=', 6983 {pz_prefix}Score{pzfields_sep}{profile} 6984 ) 6985 """ 6986 ) 6987 6988 # PZFlag 6989 if ( 6990 f"{pz_prefix}Flag{pzfields_sep}{profile}" 6991 in list_of_pzfields 6992 ): 6993 sql_set_info.append( 6994 f""" 6995 concat( 6996 '{pz_prefix}Flag{pzfields_sep}{profile}=', 6997 CASE 6998 WHEN 
{pz_prefix}Flag{pzfields_sep}{profile}==1 6999 THEN 'PASS' 7000 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7001 THEN 'FILTERED' 7002 END 7003 ) 7004 """ 7005 ) 7006 if ( 7007 profile == default_profile 7008 and f"{pz_prefix}Flag" in list_of_pzfields 7009 ): 7010 sql_set_info.append( 7011 f""" 7012 concat( 7013 '{pz_prefix}Flag=', 7014 CASE 7015 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7016 THEN 'PASS' 7017 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7018 THEN 'FILTERED' 7019 END 7020 ) 7021 """ 7022 ) 7023 7024 # PZClass 7025 if ( 7026 f"{pz_prefix}Class{pzfields_sep}{profile}" 7027 in list_of_pzfields 7028 ): 7029 sql_set_info.append( 7030 f""" 7031 concat( 7032 '{pz_prefix}Class{pzfields_sep}{profile}=', 7033 CASE 7034 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7035 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7036 ELSE '.' 7037 END 7038 ) 7039 7040 """ 7041 ) 7042 if ( 7043 profile == default_profile 7044 and f"{pz_prefix}Class" in list_of_pzfields 7045 ): 7046 sql_set_info.append( 7047 f""" 7048 concat( 7049 '{pz_prefix}Class=', 7050 CASE 7051 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7052 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7053 ELSE '.' 
7054 END 7055 ) 7056 """ 7057 ) 7058 7059 # PZComment 7060 if ( 7061 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7062 in list_of_pzfields 7063 ): 7064 sql_set_info.append( 7065 f""" 7066 CASE 7067 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7068 THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile}) 7069 ELSE '' 7070 END 7071 """ 7072 ) 7073 if ( 7074 profile == default_profile 7075 and f"{pz_prefix}Comment" in list_of_pzfields 7076 ): 7077 sql_set_info.append( 7078 f""" 7079 CASE 7080 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7081 THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile}) 7082 ELSE '' 7083 END 7084 """ 7085 ) 7086 7087 # PZInfos 7088 if ( 7089 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7090 in list_of_pzfields 7091 ): 7092 sql_set_info.append( 7093 f""" 7094 CASE 7095 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7096 THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile}) 7097 ELSE '' 7098 END 7099 """ 7100 ) 7101 if ( 7102 profile == default_profile 7103 and f"{pz_prefix}Infos" in list_of_pzfields 7104 ): 7105 sql_set_info.append( 7106 f""" 7107 CASE 7108 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7109 THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile}) 7110 ELSE '' 7111 END 7112 """ 7113 ) 7114 7115 # Merge PZfields 7116 sql_set_info_option = "" 7117 sql_set_sep = "" 7118 for sql_set in sql_set_info: 7119 if sql_set_sep: 7120 sql_set_info_option += f""" 7121 , concat('{sql_set_sep}', {sql_set}) 7122 """ 7123 else: 7124 sql_set_info_option += f""" 7125 , {sql_set} 7126 """ 7127 sql_set_sep = ";" 7128 7129 sql_queries = [] 7130 for annotation in prioritizations_config[profile]: 7131 7132 # skip special sections 7133 if annotation.startswith("_"): 7134 continue 7135 7136 # For each criterions 7137 for criterion in prioritizations_config[profile][ 7138 annotation 
7139 ]: 7140 7141 # Criterion mode 7142 criterion_mode = None 7143 if np.any( 7144 np.isin(list(criterion.keys()), ["type", "value"]) 7145 ): 7146 criterion_mode = "operation" 7147 elif np.any( 7148 np.isin(list(criterion.keys()), ["sql", "fields"]) 7149 ): 7150 criterion_mode = "sql" 7151 log.debug(f"Criterion Mode: {criterion_mode}") 7152 7153 # Criterion parameters 7154 criterion_type = criterion.get("type", None) 7155 criterion_value = criterion.get("value", None) 7156 criterion_sql = criterion.get("sql", None) 7157 criterion_fields = criterion.get("fields", None) 7158 criterion_score = criterion.get("score", 0) 7159 criterion_flag = criterion.get("flag", "PASS") 7160 criterion_class = criterion.get("class", None) 7161 criterion_flag_bool = criterion_flag == "PASS" 7162 criterion_comment = ( 7163 ", ".join(criterion.get("comment", [])) 7164 .replace("'", "''") 7165 .replace(";", ",") 7166 .replace("\t", " ") 7167 ) 7168 criterion_infos = ( 7169 str(criterion) 7170 .replace("'", "''") 7171 .replace(";", ",") 7172 .replace("\t", " ") 7173 ) 7174 7175 # SQL 7176 if criterion_sql is not None and isinstance( 7177 criterion_sql, list 7178 ): 7179 criterion_sql = " ".join(criterion_sql) 7180 7181 # Fields and explode 7182 if criterion_fields is None: 7183 criterion_fields = [annotation] 7184 if not isinstance(criterion_fields, list): 7185 criterion_fields = str(criterion_fields).split(",") 7186 7187 # Class 7188 if criterion_class is not None and not isinstance( 7189 criterion_class, list 7190 ): 7191 criterion_class = str(criterion_class).split(",") 7192 7193 for annotation_field in criterion_fields: 7194 7195 # Explode specific annotation 7196 log.debug( 7197 f"Explode annotation '{annotation_field}'" 7198 ) 7199 added_columns += self.explode_infos( 7200 prefix=explode_infos_prefix, 7201 fields=[annotation_field], 7202 table=table_variants, 7203 ) 7204 extra_infos = self.get_extra_infos( 7205 table=table_variants 7206 ) 7207 7208 # Check if annotation field is 
present 7209 if ( 7210 f"{explode_infos_prefix}{annotation_field}" 7211 not in extra_infos 7212 ): 7213 msq_err = f"Annotation '{annotation_field}' not in data" 7214 log.error(msq_err) 7215 raise ValueError(msq_err) 7216 else: 7217 log.debug( 7218 f"Annotation '{annotation_field}' in data" 7219 ) 7220 7221 sql_set = [] 7222 sql_set_info = [] 7223 7224 # PZ fields set 7225 7226 # PZScore 7227 if ( 7228 f"{pz_prefix}Score{pzfields_sep}{profile}" 7229 in list_of_pzfields 7230 ): 7231 # if prioritization_score_mode == "HOWARD": 7232 # sql_set.append( 7233 # f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7234 # ) 7235 # VaRank prioritization score mode 7236 if prioritization_score_mode == "VaRank": 7237 sql_set.append( 7238 f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END" 7239 ) 7240 # default HOWARD prioritization score mode 7241 else: 7242 sql_set.append( 7243 f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7244 ) 7245 7246 # PZFlag 7247 if ( 7248 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7249 in list_of_pzfields 7250 ): 7251 sql_set.append( 7252 f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}" 7253 ) 7254 7255 # PZClass 7256 if ( 7257 f"{pz_prefix}Class{pzfields_sep}{profile}" 7258 in list_of_pzfields 7259 and criterion_class is not None 7260 ): 7261 sql_set.append( 7262 f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) " 7263 ) 7264 7265 # PZComment 7266 if ( 7267 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7268 in list_of_pzfields 7269 ): 7270 sql_set.append( 7271 f""" 7272 {pz_prefix}Comment{pzfields_sep}{profile} = 7273 concat( 7274 {pz_prefix}Comment{pzfields_sep}{profile}, 7275 CASE 7276 WHEN 
{pz_prefix}Comment{pzfields_sep}{profile}!='' 7277 THEN ', ' 7278 ELSE '' 7279 END, 7280 '{criterion_comment}' 7281 ) 7282 """ 7283 ) 7284 7285 # PZInfos 7286 if ( 7287 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7288 in list_of_pzfields 7289 ): 7290 sql_set.append( 7291 f""" 7292 {pz_prefix}Infos{pzfields_sep}{profile} = 7293 concat( 7294 {pz_prefix}Infos{pzfields_sep}{profile}, 7295 '{criterion_infos}' 7296 ) 7297 """ 7298 ) 7299 sql_set_option = ",".join(sql_set) 7300 7301 # Criterion and comparison 7302 if sql_set_option: 7303 7304 if criterion_mode in ["operation"]: 7305 7306 try: 7307 float(criterion_value) 7308 sql_update = f""" 7309 UPDATE {table_variants} 7310 SET {sql_set_option} 7311 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 7312 AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value} 7313 """ 7314 except: 7315 contains_option = "" 7316 if criterion_type == "contains": 7317 contains_option = ".*" 7318 sql_update = f""" 7319 UPDATE {table_variants} 7320 SET {sql_set_option} 7321 WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 7322 """ 7323 sql_queries.append(sql_update) 7324 7325 elif criterion_mode in ["sql"]: 7326 7327 sql_update = f""" 7328 UPDATE {table_variants} 7329 SET {sql_set_option} 7330 WHERE {criterion_sql} 7331 """ 7332 sql_queries.append(sql_update) 7333 7334 else: 7335 msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')" 7336 log.error(msg_err) 7337 raise ValueError(msg_err) 7338 7339 else: 7340 log.warning( 7341 f"NO SQL SET option for '{annotation}' - '{criterion}'" 7342 ) 7343 7344 # PZTags 7345 if ( 7346 f"{pz_prefix}Tags{pzfields_sep}{profile}" 7347 in list_of_pzfields 7348 ): 7349 7350 # Create PZFalgs value 7351 pztags_value = "" 7352 pztags_sep_default = "," 7353 pztags_sep = "" 7354 for pzfield in pzfields: 7355 if pzfield not in [f"{pz_prefix}Tags"]: 7356 if ( 7357 
f"{pzfield}{pzfields_sep}{profile}" 7358 in list_of_pzfields 7359 ): 7360 if pzfield in [f"{pz_prefix}Flag"]: 7361 pztags_value += f"""{pztags_sep}{pzfield}#', 7362 CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile} 7363 THEN 'PASS' 7364 ELSE 'FILTERED' 7365 END, '""" 7366 elif pzfield in [f"{pz_prefix}Class"]: 7367 pztags_value += f"""{pztags_sep}{pzfield}#', 7368 CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7369 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7370 ELSE '.' 7371 END, '""" 7372 else: 7373 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 7374 pztags_sep = pztags_sep_default 7375 7376 # Add Query update for PZFlags 7377 sql_update_pztags = f""" 7378 UPDATE {table_variants} 7379 SET INFO = concat( 7380 INFO, 7381 CASE WHEN INFO NOT in ('','.') 7382 THEN ';' 7383 ELSE '' 7384 END, 7385 '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}' 7386 ) 7387 """ 7388 sql_queries.append(sql_update_pztags) 7389 7390 # Add Query update for PZFlags for default 7391 if profile == default_profile: 7392 sql_update_pztags_default = f""" 7393 UPDATE {table_variants} 7394 SET INFO = concat( 7395 INFO, 7396 ';', 7397 '{pz_prefix}Tags={pztags_value}' 7398 ) 7399 """ 7400 sql_queries.append(sql_update_pztags_default) 7401 7402 log.info(f"""Profile '{profile}' - Prioritization... """) 7403 7404 if sql_queries: 7405 7406 for sql_query in sql_queries: 7407 log.debug( 7408 f"""Profile '{profile}' - Prioritization query: {sql_query}... """ 7409 ) 7410 self.conn.execute(sql_query) 7411 7412 log.info(f"""Profile '{profile}' - Update... 
""") 7413 sql_query_update = f""" 7414 UPDATE {table_variants} 7415 SET INFO = 7416 concat( 7417 CASE 7418 WHEN INFO NOT IN ('','.') 7419 THEN concat(INFO, ';') 7420 ELSE '' 7421 END 7422 {sql_set_info_option} 7423 ) 7424 """ 7425 self.conn.execute(sql_query_update) 7426 7427 else: 7428 7429 log.warning(f"No profiles in parameters") 7430 7431 # Remove added columns 7432 for added_column in added_columns: 7433 self.drop_column(column=added_column) 7434 7435 # Explode INFOS fields into table fields 7436 if self.get_explode_infos(): 7437 self.explode_infos( 7438 prefix=self.get_explode_infos_prefix(), 7439 fields=self.get_explode_infos_fields(), 7440 force=True, 7441 ) 7442 7443 return True 7444 7445 ### 7446 # HGVS 7447 ### 7448 7449 def annotation_hgvs(self, threads: int = None) -> None: 7450 """ 7451 The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic 7452 coordinates and alleles. 7453 7454 :param threads: The `threads` parameter is an optional integer that specifies the number of 7455 threads to use for parallel processing. If no value is provided, it will default to the number 7456 of threads obtained from the `get_threads()` method 7457 :type threads: int 7458 """ 7459 7460 # Function for each partition of the Dask Dataframe 7461 def partition_function(partition): 7462 """ 7463 The function `partition_function` applies the `annotation_hgvs_partition` function to 7464 each row of a DataFrame called `partition`. 7465 7466 :param partition: The parameter "partition" is a pandas DataFrame that contains the data 7467 to be processed 7468 :return: the result of applying the "annotation_hgvs_partition" function to each row of 7469 the "partition" dataframe along the axis 1. 
7470 """ 7471 return partition.apply(annotation_hgvs_partition, axis=1) 7472 7473 def annotation_hgvs_partition(row) -> str: 7474 """ 7475 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7476 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7477 7478 :param row: A dictionary-like object that contains the values for the following keys: 7479 :return: a string that contains the HGVS names associated with the given row of data. 7480 """ 7481 7482 chr = row["CHROM"] 7483 pos = row["POS"] 7484 ref = row["REF"] 7485 alt = row["ALT"] 7486 7487 # Find list of associated transcripts 7488 transcripts_list = list( 7489 polars_conn.execute( 7490 f""" 7491 SELECT transcript 7492 FROM refseq_df 7493 WHERE CHROM='{chr}' 7494 AND POS={pos} 7495 """ 7496 )["transcript"] 7497 ) 7498 7499 # Full HGVS annotation in list 7500 hgvs_full_list = [] 7501 7502 for transcript_name in transcripts_list: 7503 7504 # Transcript 7505 transcript = get_transcript( 7506 transcripts=transcripts, transcript_name=transcript_name 7507 ) 7508 # Exon 7509 if use_exon: 7510 exon = transcript.find_exon_number(pos) 7511 else: 7512 exon = None 7513 # Protein 7514 transcript_protein = None 7515 if use_protein or add_protein or full_format: 7516 transcripts_protein = list( 7517 polars_conn.execute( 7518 f""" 7519 SELECT protein 7520 FROM refseqlink_df 7521 WHERE transcript='{transcript_name}' 7522 LIMIT 1 7523 """ 7524 )["protein"] 7525 ) 7526 if len(transcripts_protein): 7527 transcript_protein = transcripts_protein[0] 7528 7529 # HGVS name 7530 hgvs_name = format_hgvs_name( 7531 chr, 7532 pos, 7533 ref, 7534 alt, 7535 genome=genome, 7536 transcript=transcript, 7537 transcript_protein=transcript_protein, 7538 exon=exon, 7539 use_gene=use_gene, 7540 use_protein=use_protein, 7541 full_format=full_format, 7542 use_version=use_version, 7543 codon_type=codon_type, 7544 ) 7545 hgvs_full_list.append(hgvs_name) 7546 if add_protein and not 
use_protein and not full_format: 7547 hgvs_name = format_hgvs_name( 7548 chr, 7549 pos, 7550 ref, 7551 alt, 7552 genome=genome, 7553 transcript=transcript, 7554 transcript_protein=transcript_protein, 7555 exon=exon, 7556 use_gene=use_gene, 7557 use_protein=True, 7558 full_format=False, 7559 use_version=use_version, 7560 codon_type=codon_type, 7561 ) 7562 hgvs_full_list.append(hgvs_name) 7563 7564 # Create liste of HGVS annotations 7565 hgvs_full = ",".join(hgvs_full_list) 7566 7567 return hgvs_full 7568 7569 # Polars connexion 7570 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7571 7572 # Config 7573 config = self.get_config() 7574 7575 # Databases 7576 # Genome 7577 databases_genomes_folders = ( 7578 config.get("folders", {}) 7579 .get("databases", {}) 7580 .get("genomes", DEFAULT_GENOME_FOLDER) 7581 ) 7582 databases_genome = ( 7583 config.get("folders", {}).get("databases", {}).get("genomes", "") 7584 ) 7585 # refseq database folder 7586 databases_refseq_folders = ( 7587 config.get("folders", {}) 7588 .get("databases", {}) 7589 .get("refseq", DEFAULT_REFSEQ_FOLDER) 7590 ) 7591 # refseq 7592 databases_refseq = config.get("databases", {}).get("refSeq", None) 7593 # refSeqLink 7594 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 7595 7596 # Param 7597 param = self.get_param() 7598 7599 # Quick HGVS 7600 if "hgvs_options" in param and param.get("hgvs_options", ""): 7601 log.info(f"Quick HGVS Annotation:") 7602 if not param.get("hgvs", None): 7603 param["hgvs"] = {} 7604 for option in param.get("hgvs_options", "").split(","): 7605 option_var_val = option.split("=") 7606 option_var = option_var_val[0] 7607 if len(option_var_val) > 1: 7608 option_val = option_var_val[1] 7609 else: 7610 option_val = "True" 7611 if option_val.upper() in ["TRUE"]: 7612 option_val = True 7613 elif option_val.upper() in ["FALSE"]: 7614 option_val = False 7615 log.info(f" {option_var}={option_val}") 7616 param["hgvs"][option_var] = option_val 7617 
7618 # Check if HGVS annotation enabled 7619 if "hgvs" in param: 7620 log.info(f"HGVS Annotation... ") 7621 for hgvs_option in param.get("hgvs", {}): 7622 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 7623 else: 7624 return 7625 7626 # HGVS Param 7627 param_hgvs = param.get("hgvs", {}) 7628 use_exon = param_hgvs.get("use_exon", False) 7629 use_gene = param_hgvs.get("use_gene", False) 7630 use_protein = param_hgvs.get("use_protein", False) 7631 add_protein = param_hgvs.get("add_protein", False) 7632 full_format = param_hgvs.get("full_format", False) 7633 use_version = param_hgvs.get("use_version", False) 7634 codon_type = param_hgvs.get("codon_type", "3") 7635 7636 # refSseq refSeqLink 7637 databases_refseq = param_hgvs.get("refseq", databases_refseq) 7638 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 7639 7640 # Assembly 7641 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 7642 7643 # Genome 7644 genome_file = None 7645 if find_genome(databases_genome): 7646 genome_file = find_genome(databases_genome) 7647 else: 7648 genome_file = find_genome( 7649 genome_path=databases_genomes_folders, assembly=assembly 7650 ) 7651 log.debug("Genome: " + str(genome_file)) 7652 7653 # refSseq 7654 refseq_file = find_file_prefix( 7655 input_file=databases_refseq, 7656 prefix="ncbiRefSeq", 7657 folder=databases_refseq_folders, 7658 assembly=assembly, 7659 ) 7660 log.debug("refSeq: " + str(refseq_file)) 7661 7662 # refSeqLink 7663 refseqlink_file = find_file_prefix( 7664 input_file=databases_refseqlink, 7665 prefix="ncbiRefSeqLink", 7666 folder=databases_refseq_folders, 7667 assembly=assembly, 7668 ) 7669 log.debug("refSeqLink: " + str(refseqlink_file)) 7670 7671 # Threads 7672 if not threads: 7673 threads = self.get_threads() 7674 log.debug("Threads: " + str(threads)) 7675 7676 # Variables 7677 table_variants = self.get_table_variants(clause="update") 7678 7679 # Get variants SNV and InDel only 7680 
query_variants = f""" 7681 SELECT "#CHROM" AS CHROM, POS, REF, ALT 7682 FROM {table_variants} 7683 WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$' 7684 """ 7685 df_variants = self.get_query_to_df(query_variants) 7686 7687 # Added columns 7688 added_columns = [] 7689 7690 # Add hgvs column in variants table 7691 hgvs_column_name = "hgvs_" + str(random.randrange(1000)) 7692 added_column = self.add_column( 7693 table_variants, hgvs_column_name, "STRING", default_value=None 7694 ) 7695 added_columns.append(added_column) 7696 7697 log.debug(f"refSeq loading...") 7698 # refSeq in duckDB 7699 refseq_table = get_refseq_table( 7700 conn=self.conn, refseq_table="refseq", refseq_file=refseq_file 7701 ) 7702 # Loading all refSeq in Dataframe 7703 refseq_query = f""" 7704 SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript 7705 FROM {refseq_table} 7706 JOIN df_variants ON ( 7707 {refseq_table}.chrom = df_variants.CHROM 7708 AND {refseq_table}.txStart<=df_variants.POS 7709 AND {refseq_table}.txEnd>=df_variants.POS 7710 ) 7711 """ 7712 refseq_df = self.conn.query(refseq_query).pl() 7713 7714 if refseqlink_file: 7715 log.debug(f"refSeqLink loading...") 7716 # refSeqLink in duckDB 7717 refseqlink_table = get_refseq_table( 7718 conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file 7719 ) 7720 # Loading all refSeqLink in Dataframe 7721 protacc_column = "protAcc_with_ver" 7722 mrnaacc_column = "mrnaAcc_with_ver" 7723 refseqlink_query = f""" 7724 SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript 7725 FROM {refseqlink_table} 7726 JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver) 7727 WHERE protAcc_without_ver IS NOT NULL 7728 """ 7729 # Polars Dataframe 7730 refseqlink_df = self.conn.query(f"{refseqlink_query}").pl() 7731 7732 # Read RefSeq transcripts into a python dict/model. 
7733 log.debug(f"Transcripts loading...") 7734 with tempfile.TemporaryDirectory() as tmpdir: 7735 transcripts_query = f""" 7736 COPY ( 7737 SELECT {refseq_table}.* 7738 FROM {refseq_table} 7739 JOIN df_variants ON ( 7740 {refseq_table}.chrom=df_variants.CHROM 7741 AND {refseq_table}.txStart<=df_variants.POS 7742 AND {refseq_table}.txEnd>=df_variants.POS 7743 ) 7744 ) 7745 TO '{tmpdir}/transcript.tsv' (DELIMITER '\t'); 7746 """ 7747 self.conn.query(transcripts_query) 7748 with open(f"{tmpdir}/transcript.tsv") as infile: 7749 transcripts = read_transcripts(infile) 7750 7751 # Polars connexion 7752 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7753 7754 log.debug("Genome loading...") 7755 # Read genome sequence using pyfaidx. 7756 genome = Fasta(genome_file) 7757 7758 log.debug("Start annotation HGVS...") 7759 7760 # Create 7761 # a Dask Dataframe from Pandas dataframe with partition as number of threads 7762 ddf = dd.from_pandas(df_variants, npartitions=threads) 7763 7764 # Use dask.dataframe.apply() to apply function on each partition 7765 ddf[hgvs_column_name] = ddf.map_partitions(partition_function) 7766 7767 # Convert Dask DataFrame to Pandas Dataframe 7768 df = ddf.compute() 7769 7770 # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???) 
        # --- Tail of `annotation_hgvs` (the method definition starts above this
        # view): persist the computed HGVS column and publish it into INFO. ---

        # Dump the annotated dataframe to Parquet and UPDATE from the file
        # (the comment above notes a VARCHAR -> NULL cast error when updating
        # straight from the in-memory dataframe).
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column, joining on the full variant key
            # (#CHROM, POS, REF, ALT); empty/NULL HGVS values are skipped
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
            """
            self.execute_query(update_variant_query)

        # Append 'hgvs=<value>' to INFO (with a ';' separator only when INFO
        # already holds content)
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO =
                concat(
                    CASE
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
        """
        self.execute_query(sql_query_update)

        # Declare the new 'hgvs' INFO field in the VCF header
        # NOTE(review): "annotatation" is a typo in the emitted header
        # Description; it is a runtime string, so it is left untouched here
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove the temporary working columns added earlier in the method
        for added_column in added_columns:
            self.drop_column(column=added_column)

    ###
    # Calculation
    ###

    def get_operations_help(
        self, operations_config_dict: dict = {}, operations_config_file: str = None
    ) -> list:
        """
        Build a sorted, human-readable list of the calculation operations
        flagged as "available" in the calculations configuration.

        NOTE(review): the mutable default `operations_config_dict={}` is only
        read (passed through to `get_config_json`), so the shared-mutable
        default pitfall is not triggered here.

        :param operations_config_dict: calculations configuration as a dict
        :param operations_config_file: calculations configuration file path
        :return: help lines, the first entry being a header line
        """

        # Init
        operations_help = []

        # operations
        operations = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )
        # Keep only operations marked "available"
        for op in operations:
            op_name = operations[op].get("name", op).upper()
            op_description = operations[op].get("description", op_name)
            op_available = operations[op].get("available", False)
            if op_available:
                operations_help.append(f" {op_name}: {op_description}")

        # Sort operations
        operations_help.sort()

        # insert header
        operations_help.insert(0, "Available calculation operations:")

        # Return
        return operations_help

    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        It takes a list of operations, and for each operation, it checks if it's a python or sql
        operation, and then calls the appropriate function

        param json example:
        "calculation": {
          "NOMEN": {
            "options": {
              "hgvs_field": "hgvs"
            },
            "middle" : null
          }
        }
        """

        # Param
        param = self.get_param()

        # operations config
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys: operation lookup is case-insensitive
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param (the param tree takes precedence over the argument)
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add: comma-separated "calculations" shortcut,
        # merged into both `operations` and the param tree
        if param.get("calculations", None):
            calculations_list = [
                value for value in param.get("calculations", "").split(",")
            ]
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f" {calculation_key}")
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations:
                    operations[calculation_operation.upper()] = {}
add_value_into_dict( 7915 dict_tree=param, 7916 sections=[ 7917 "calculation", 7918 "calculations", 7919 calculation_operation.upper(), 7920 ], 7921 value={}, 7922 ) 7923 7924 # Operations for calculation 7925 if not operations: 7926 operations = param.get("calculation", {}).get("calculations", {}) 7927 7928 if operations: 7929 log.info(f"Calculations...") 7930 7931 # For each operations 7932 for operation_name in operations: 7933 operation_name = operation_name.upper() 7934 if operation_name not in [""]: 7935 if operation_name in operations_config: 7936 log.info(f"Calculation '{operation_name}'") 7937 operation = operations_config[operation_name] 7938 operation_type = operation.get("type", "sql") 7939 if operation_type == "python": 7940 self.calculation_process_function( 7941 operation=operation, operation_name=operation_name 7942 ) 7943 elif operation_type == "sql": 7944 self.calculation_process_sql( 7945 operation=operation, operation_name=operation_name 7946 ) 7947 else: 7948 log.error( 7949 f"Operations config: Type '{operation_type}' NOT available" 7950 ) 7951 raise ValueError( 7952 f"Operations config: Type '{operation_type}' NOT available" 7953 ) 7954 else: 7955 log.error( 7956 f"Operations config: Calculation '{operation_name}' NOT available" 7957 ) 7958 raise ValueError( 7959 f"Operations config: Calculation '{operation_name}' NOT available" 7960 ) 7961 7962 # Explode INFOS fields into table fields 7963 if self.get_explode_infos(): 7964 self.explode_infos( 7965 prefix=self.get_explode_infos_prefix(), 7966 fields=self.get_explode_infos_fields(), 7967 force=True, 7968 ) 7969 7970 def calculation_process_sql( 7971 self, operation: dict, operation_name: str = "unknown" 7972 ) -> None: 7973 """ 7974 The `calculation_process_sql` function takes in a mathematical operation as a string and 7975 performs the operation, updating the specified table with the result. 
7976 7977 :param operation: The `operation` parameter is a dictionary that contains information about the 7978 mathematical operation to be performed. It includes the following keys: 7979 :type operation: dict 7980 :param operation_name: The `operation_name` parameter is a string that represents the name of 7981 the mathematical operation being performed. It is used for logging and error handling purposes, 7982 defaults to unknown 7983 :type operation_name: str (optional) 7984 """ 7985 7986 # table variants 7987 table_variants = self.get_table_variants(clause="alter") 7988 7989 # Operation infos 7990 operation_name = operation.get("name", "unknown") 7991 log.debug(f"process sql {operation_name}") 7992 output_column_name = operation.get("output_column_name", operation_name) 7993 output_column_type = operation.get("output_column_type", "String") 7994 prefix = operation.get("explode_infos_prefix", "") 7995 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 7996 output_column_description = operation.get( 7997 "output_column_description", f"{operation_name} operation" 7998 ) 7999 operation_query = operation.get("operation_query", None) 8000 if isinstance(operation_query, list): 8001 operation_query = " ".join(operation_query) 8002 operation_info_fields = operation.get("info_fields", []) 8003 operation_info_fields_check = operation.get("info_fields_check", False) 8004 operation_info = operation.get("operation_info", True) 8005 8006 if operation_query: 8007 8008 # Info fields check 8009 operation_info_fields_check_result = True 8010 if operation_info_fields_check: 8011 header_infos = self.get_header().infos 8012 for info_field in operation_info_fields: 8013 operation_info_fields_check_result = ( 8014 operation_info_fields_check_result 8015 and info_field in header_infos 8016 ) 8017 8018 # If info fields available 8019 if operation_info_fields_check_result: 8020 8021 # Added_columns 8022 added_columns = [] 8023 8024 # Create VCF header field 
8025 vcf_reader = self.get_header() 8026 vcf_reader.infos[output_column_name] = vcf.parser._Info( 8027 output_column_name, 8028 ".", 8029 output_column_type, 8030 output_column_description, 8031 "howard calculation", 8032 "0", 8033 self.code_type_map.get(output_column_type), 8034 ) 8035 8036 # Explode infos if needed 8037 log.debug(f"calculation_process_sql prefix {prefix}") 8038 added_columns += self.explode_infos( 8039 prefix=prefix, 8040 fields=[output_column_name] + operation_info_fields, 8041 force=True, 8042 ) 8043 8044 # Create column 8045 added_column = self.add_column( 8046 table_name=table_variants, 8047 column_name=prefix + output_column_name, 8048 column_type=output_column_type_sql, 8049 default_value="null", 8050 ) 8051 added_columns.append(added_column) 8052 8053 # Operation calculation 8054 try: 8055 8056 # Query to update calculation column 8057 sql_update = f""" 8058 UPDATE {table_variants} 8059 SET "{prefix}{output_column_name}" = ({operation_query}) 8060 """ 8061 self.conn.execute(sql_update) 8062 8063 # Add to INFO 8064 if operation_info: 8065 sql_update_info = f""" 8066 UPDATE {table_variants} 8067 SET "INFO" = 8068 concat( 8069 CASE 8070 WHEN "INFO" IS NOT NULL 8071 THEN concat("INFO", ';') 8072 ELSE '' 8073 END, 8074 '{output_column_name}=', 8075 "{prefix}{output_column_name}" 8076 ) 8077 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 8078 """ 8079 self.conn.execute(sql_update_info) 8080 8081 except: 8082 log.error( 8083 f"Operations config: Calculation '{operation_name}' query failed" 8084 ) 8085 raise ValueError( 8086 f"Operations config: Calculation '{operation_name}' query failed" 8087 ) 8088 8089 # Remove added columns 8090 for added_column in added_columns: 8091 log.debug(f"added_column: {added_column}") 8092 self.drop_column(column=added_column) 8093 8094 else: 8095 log.error( 8096 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields 
{operation_info_fields}" 8097 ) 8098 raise ValueError( 8099 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8100 ) 8101 8102 else: 8103 log.error( 8104 f"Operations config: Calculation '{operation_name}' query NOT defined" 8105 ) 8106 raise ValueError( 8107 f"Operations config: Calculation '{operation_name}' query NOT defined" 8108 ) 8109 8110 def calculation_process_function( 8111 self, operation: dict, operation_name: str = "unknown" 8112 ) -> None: 8113 """ 8114 The `calculation_process_function` takes in an operation dictionary and performs the specified 8115 function with the given parameters. 8116 8117 :param operation: The `operation` parameter is a dictionary that contains information about the 8118 operation to be performed. It has the following keys: 8119 :type operation: dict 8120 :param operation_name: The `operation_name` parameter is a string that represents the name of 8121 the operation being performed. It is used for logging purposes, defaults to unknown 8122 :type operation_name: str (optional) 8123 """ 8124 8125 operation_name = operation["name"] 8126 log.debug(f"process sql {operation_name}") 8127 function_name = operation["function_name"] 8128 function_params = operation["function_params"] 8129 getattr(self, function_name)(*function_params) 8130 8131 def calculation_variant_id(self) -> None: 8132 """ 8133 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 8134 updates the INFO field of a variants table with the variant ID. 
        """

        # variant_id annotation field
        variant_id_tag = self.get_variant_id_column()
        # The exploded variant-id column is temporary and dropped at the end
        added_columns = [variant_id_tag]

        # variant_id tags
        vcf_infos_tags = {
            variant_id_tag: "howard variant ID annotation",
        }

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add variant_id to header
        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
            variant_id_tag,
            ".",
            "String",
            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Append '<variant_id_tag>=<value>' to INFO for every variant
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    '{variant_id_tag}=',
                    "{variant_id_tag}"
                )
        """
        self.conn.execute(sql_update)

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
        annotation field in a VCF file and adds them as a new column in the variants table.

        :param snpeff_hgvs: name of the INFO tag that will receive the HGVS
            nomenclatures extracted from the SnpEff annotation field, defaults
            to snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: INFO field holding the SnpEff annotations,
            defaults to ANN
        :type snpeff_field: str (optional)
        """

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix
        # NOTE(review): a non-empty explode prefix is unconditionally replaced
        # by "INFO/" (an empty prefix stays empty) — confirm this is intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (exploded column names)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add columns
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract ANN header: SnpEff sub-field names are quoted inside the
            # header Description and separated by " | "
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Sub-field key reduced to its alphanumeric characters
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Extract the HGVS list for each variant from the ANN value
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<snpeff_hgvs>=<value>' to INFO, joining on the variant id
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{snpeff_hgvs}=',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
        exploding the HGVS field and updating variant information accordingly.

        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
        boolean flag that determines whether the output should be uniquified or not.
        When set to `True`,
        it indicates that the output should be unique, meaning that duplicate entries should be removed,
        defaults to True
        :type uniquify: bool (optional)
        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
        function specifies the format in which the output annotations will be generated. It has a
        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
        format, defaults to fields
        :type output_format: str (optional)
        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
        method is used to specify the prefix that will be added to the output annotations generated
        during the calculation process. This prefix helps to differentiate the newly added annotations
        from existing ones in the output data, defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
        function is used to specify the field in the VCF file that contains SnpEff annotations. This
        field will be processed to explode the HGVS annotations and update the variant information
        accordingly, defaults to ANN
        :type snpeff_field: str (optional)
        """

        # SnpEff annotation field (internal working-column/tag name)
        snpeff_hgvs = "snpeff_ann_explode"

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix
        # NOTE(review): a non-empty explode prefix is unconditionally replaced
        # by "INFO/" (an empty prefix stays empty) — confirm this is intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (exploded column names)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add columns
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract ANN header: SnpEff sub-field names are quoted inside the
            # header Description and separated by " | "
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Sub-field key reduced to its alphanumeric characters
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create snpEff columns: explode each ANN value into the requested
            # output format (flat fields or JSON)
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Header: one INFO field for JSON output, otherwise one INFO field
            # per exploded ANN sub-annotation
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Append the exploded annotations to INFO, joining on the variant id
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
        """

        # NOMEN field: column holding the per-variant NOMEN dict
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: INFO tag -> header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Get HGVS field
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get transcripts (optional file of preferred transcripts; first column)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Added columns
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Create main NOMEN column
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field
                # into a column
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                # One CASE fragment per NOMEN tag, concatenated into INFO below
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Append all NOMEN tags to INFO, joining on the full variant key
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                AND variants."POS" = dataframe_hgvs."POS"
                AND variants."REF" = dataframe_hgvs."REF"
                AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        The function `calculation_find_by_pipeline` performs a calculation to find the number of
        pipeline/sample for a variant and updates the variant information in a VCF file.

        :param tag: The `tag` parameter is a string that represents the annotation field for the
        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
        VCF header and to update the corresponding field in the variants table, defaults to
        findbypipeline
        :type tag: str (optional)
        """

        # Only applicable when the VCF carries genotypes (FORMAT + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # VCF infos tags
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create findbypipeline column: per-row count computed from the
            # FORMAT and sample columns by the `findbypipeline` helper
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add the findbypipeline tag to the header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<findbypipeline_tag>=<value>' to INFO, joining on the
            # variant id
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                            AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                '{findbypipeline_tag}=',
                                dataframe_findbypipeline."{findbypipeline_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_findbypipeline
            gc.collect()

    def calculation_genotype_concordance(self) -> None:
        """
        The function `calculation_genotype_concordance` calculates the genotype concordance for
        multi-caller VCF files and updates the variant information in the database.
        """

        # Only applicable when the VCF carries genotypes (FORMAT + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # genotypeconcordance annotation field
            genotypeconcordance_tag = "genotypeconcordance"

            # VCF infos tags
            vcf_infos_tags = {
                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            genotypeconcordance_infos = prefix + genotypeconcordance_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_genotypeconcordance = self.get_query_to_df(
                f""" SELECT
{samples_fields} FROM {table_variants} """ 8788 ) 8789 8790 # Create genotypeconcordance column 8791 dataframe_genotypeconcordance[genotypeconcordance_infos] = ( 8792 dataframe_genotypeconcordance.apply( 8793 lambda row: genotypeconcordance( 8794 row, samples=self.get_header_sample_list() 8795 ), 8796 axis=1, 8797 ) 8798 ) 8799 8800 # Add genotypeconcordance to header 8801 vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info( 8802 genotypeconcordance_tag, 8803 ".", 8804 "String", 8805 vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"), 8806 "howard calculation", 8807 "0", 8808 self.code_type_map.get("String"), 8809 ) 8810 8811 # Update 8812 sql_update = f""" 8813 UPDATE variants 8814 SET "INFO" = 8815 concat( 8816 CASE 8817 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8818 THEN '' 8819 ELSE concat("INFO", ';') 8820 END, 8821 CASE 8822 WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.') 8823 AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL 8824 THEN concat( 8825 '{genotypeconcordance_tag}=', 8826 dataframe_genotypeconcordance."{genotypeconcordance_infos}" 8827 ) 8828 ELSE '' 8829 END 8830 ) 8831 FROM dataframe_genotypeconcordance 8832 WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}" 8833 """ 8834 self.conn.execute(sql_update) 8835 8836 # Remove added columns 8837 for added_column in added_columns: 8838 self.drop_column(column=added_column) 8839 8840 # Delete dataframe 8841 del dataframe_genotypeconcordance 8842 gc.collect() 8843 8844 def calculation_barcode(self, tag: str = "barcode") -> None: 8845 """ 8846 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 8847 updates the INFO field in the file with the calculated barcode values. 8848 8849 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 8850 name that will be used for the barcode calculation in the VCF file. 
If no tag name is provided, 8851 the default tag name is set to "barcode", defaults to barcode 8852 :type tag: str (optional) 8853 """ 8854 8855 # if FORMAT and samples 8856 if ( 8857 "FORMAT" in self.get_header_columns_as_list() 8858 and self.get_header_sample_list() 8859 ): 8860 8861 # barcode annotation field 8862 if not tag: 8863 tag = "barcode" 8864 8865 # VCF infos tags 8866 vcf_infos_tags = { 8867 tag: "barcode calculation (VaRank)", 8868 } 8869 8870 # Prefix 8871 prefix = self.get_explode_infos_prefix() 8872 8873 # Field 8874 barcode_infos = prefix + tag 8875 8876 # Variants table 8877 table_variants = self.get_table_variants() 8878 8879 # Header 8880 vcf_reader = self.get_header() 8881 8882 # Create variant id 8883 variant_id_column = self.get_variant_id_column() 8884 added_columns = [variant_id_column] 8885 8886 # variant_id, FORMAT and samples 8887 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8888 self.get_header_sample_list() 8889 ) 8890 8891 # Create dataframe 8892 dataframe_barcode = self.get_query_to_df( 8893 f""" SELECT {samples_fields} FROM {table_variants} """ 8894 ) 8895 8896 # Create barcode column 8897 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 8898 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 8899 ) 8900 8901 # Add barcode to header 8902 vcf_reader.infos[tag] = vcf.parser._Info( 8903 tag, 8904 ".", 8905 "String", 8906 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 8907 "howard calculation", 8908 "0", 8909 self.code_type_map.get("String"), 8910 ) 8911 8912 # Update 8913 sql_update = f""" 8914 UPDATE {table_variants} 8915 SET "INFO" = 8916 concat( 8917 CASE 8918 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8919 THEN '' 8920 ELSE concat("INFO", ';') 8921 END, 8922 CASE 8923 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 8924 AND dataframe_barcode."{barcode_infos}" NOT NULL 8925 THEN concat( 8926 '{tag}=', 8927 dataframe_barcode."{barcode_infos}" 8928 ) 8929 ELSE '' 8930 
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        Compute a family barcode for each variant and write it into the sample
        (genotype) columns of the variants table, as two new FORMAT fields:
        `<tag>` (the family barcode) and `<tag>S` (the comma-separated family samples).

        The family members are taken from the `BARCODEFAMILY.family_pedigree`
        calculation parameter, which may be a JSON file path, a JSON string, a
        comma-separated sample list, or a dict; when absent, all samples are used.

        :param tag: FORMAT tag that will be added to the VCF; falsy values fall
        back to "BCF", defaults to BCF
        :type tag: str (optional)
        """

        # Only meaningful for VCFs with genotypes (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (guard against tag=None/"")
            if not tag:
                tag = "BCF"

            # VCF infos tags (header descriptions for the two new FORMAT fields)
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # PED param: pedigree description from calculation options
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED: normalize the pedigree into a dict {member: sample}
            if ped:

                # Pedigree is a file path -> load JSON content
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    # NOTE: 'ped' is rebound from path -> file handle -> dict here
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: try JSON first, else comma-separated samples
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON: treat as "sample1,sample2,..." with each sample
                        # being its own pedigree member
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is already a dict: use as-is
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct the list of family samples from the pedigree values
                ped_samples = list(ped.values())

            else:
                # No pedigree provided: every sample is a family member
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree is non-empty
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log the resolved family members
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Column holding the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (tracked so it can be dropped afterwards)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and family samples only
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe
            # NOTE: the local variable name 'dataframe_barcode' is referenced by name
            # in the SQL below (DuckDB replacement scan) — do not rename.
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column (computed over the family samples only)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add both barcode family FORMAT fields to the header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per column: family samples get the barcode value,
            # FORMAT gets the new field names, other samples get '.'
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # For a './.' genotype, pad with one ':.' per FORMAT subfield:
                # stripping alphanumerics/spaces from FORMAT leaves only the ':'
                # separators, then each ':' becomes ':.' — presumably to keep the
                # genotype column aligned with FORMAT (TODO confirm against callers)
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET clauses in a single UPDATE joined on the variant id
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_barcode
            gc.collect()
vcf.parser._Format( 9073 id=tag, 9074 num=".", 9075 type="String", 9076 desc=vcf_infos_tags.get(tag, "barcode family calculation"), 9077 type_code=self.code_type_map.get("String"), 9078 ) 9079 vcf_reader.formats[f"{tag}S"] = vcf.parser._Format( 9080 id=f"{tag}S", 9081 num=".", 9082 type="String", 9083 desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"), 9084 type_code=self.code_type_map.get("String"), 9085 ) 9086 9087 # Update 9088 # for sample in ped_samples: 9089 sql_update_set = [] 9090 for sample in self.get_header_sample_list() + ["FORMAT"]: 9091 if sample in ped_samples: 9092 value = f'dataframe_barcode."{barcode_infos}"' 9093 value_samples = "'" + ",".join(ped_samples) + "'" 9094 elif sample == "FORMAT": 9095 value = f"'{tag}'" 9096 value_samples = f"'{tag}S'" 9097 else: 9098 value = "'.'" 9099 value_samples = "'.'" 9100 format_regex = r"[a-zA-Z0-9\s]" 9101 sql_update_set.append( 9102 f""" 9103 "{sample}" = 9104 concat( 9105 CASE 9106 WHEN {table_variants}."{sample}" = './.' 9107 THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g')) 9108 ELSE {table_variants}."{sample}" 9109 END, 9110 ':', 9111 {value}, 9112 ':', 9113 {value_samples} 9114 ) 9115 """ 9116 ) 9117 9118 sql_update_set_join = ", ".join(sql_update_set) 9119 sql_update = f""" 9120 UPDATE {table_variants} 9121 SET {sql_update_set_join} 9122 FROM dataframe_barcode 9123 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 9124 """ 9125 self.conn.execute(sql_update) 9126 9127 # Remove added columns 9128 for added_column in added_columns: 9129 self.drop_column(column=added_column) 9130 9131 # Delete dataframe 9132 del dataframe_barcode 9133 gc.collect() 9134 9135 def calculation_trio(self) -> None: 9136 """ 9137 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 9138 information to the INFO field of each variant. 
9139 """ 9140 9141 # if FORMAT and samples 9142 if ( 9143 "FORMAT" in self.get_header_columns_as_list() 9144 and self.get_header_sample_list() 9145 ): 9146 9147 # trio annotation field 9148 trio_tag = "trio" 9149 9150 # VCF infos tags 9151 vcf_infos_tags = { 9152 "trio": "trio calculation", 9153 } 9154 9155 # Param 9156 param = self.get_param() 9157 9158 # Prefix 9159 prefix = self.get_explode_infos_prefix() 9160 9161 # Trio param 9162 trio_ped = ( 9163 param.get("calculation", {}) 9164 .get("calculations", {}) 9165 .get("TRIO", {}) 9166 .get("trio_pedigree", None) 9167 ) 9168 9169 # Load trio 9170 if trio_ped: 9171 9172 # Trio pedigree is a file 9173 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 9174 log.debug("TRIO pedigree is file") 9175 with open(full_path(trio_ped)) as trio_ped: 9176 trio_ped = json.load(trio_ped) 9177 9178 # Trio pedigree is a string 9179 elif isinstance(trio_ped, str): 9180 log.debug("TRIO pedigree is str") 9181 try: 9182 trio_ped = json.loads(trio_ped) 9183 log.debug("TRIO pedigree is json str") 9184 except ValueError as e: 9185 trio_samples = trio_ped.split(",") 9186 if len(trio_samples) == 3: 9187 trio_ped = { 9188 "father": trio_samples[0], 9189 "mother": trio_samples[1], 9190 "child": trio_samples[2], 9191 } 9192 log.debug("TRIO pedigree is list str") 9193 else: 9194 msg_error = "TRIO pedigree not well formatted" 9195 log.error(msg_error) 9196 raise ValueError(msg_error) 9197 9198 # Trio pedigree is a dict 9199 elif isinstance(trio_ped, dict): 9200 log.debug("TRIO pedigree is dict") 9201 9202 # Trio pedigree is not well formatted 9203 else: 9204 msg_error = "TRIO pedigree not well formatted" 9205 log.error(msg_error) 9206 raise ValueError(msg_error) 9207 9208 # Construct trio list 9209 trio_samples = [ 9210 trio_ped.get("father", ""), 9211 trio_ped.get("mother", ""), 9212 trio_ped.get("child", ""), 9213 ] 9214 9215 else: 9216 log.debug("TRIO pedigree not defined. 
Take the first 3 samples") 9217 samples_list = self.get_header_sample_list() 9218 if len(samples_list) >= 3: 9219 trio_samples = self.get_header_sample_list()[0:3] 9220 trio_ped = { 9221 "father": trio_samples[0], 9222 "mother": trio_samples[1], 9223 "child": trio_samples[2], 9224 } 9225 else: 9226 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 9227 log.error(msg_error) 9228 raise ValueError(msg_error) 9229 9230 # Check trio pedigree 9231 if not trio_ped or len(trio_ped) != 3: 9232 msg_error = f"Error in TRIO pedigree: {trio_ped}" 9233 log.error(msg_error) 9234 raise ValueError(msg_error) 9235 9236 # Log 9237 log.info( 9238 f"Calculation 'TRIO' - Samples: " 9239 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 9240 ) 9241 9242 # Field 9243 trio_infos = prefix + trio_tag 9244 9245 # Variants table 9246 table_variants = self.get_table_variants() 9247 9248 # Header 9249 vcf_reader = self.get_header() 9250 9251 # Create variant id 9252 variant_id_column = self.get_variant_id_column() 9253 added_columns = [variant_id_column] 9254 9255 # variant_id, FORMAT and samples 9256 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9257 self.get_header_sample_list() 9258 ) 9259 9260 # Create dataframe 9261 dataframe_trio = self.get_query_to_df( 9262 f""" SELECT {samples_fields} FROM {table_variants} """ 9263 ) 9264 9265 # Create trio column 9266 dataframe_trio[trio_infos] = dataframe_trio.apply( 9267 lambda row: trio(row, samples=trio_samples), axis=1 9268 ) 9269 9270 # Add trio to header 9271 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9272 trio_tag, 9273 ".", 9274 "String", 9275 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9276 "howard calculation", 9277 "0", 9278 self.code_type_map.get("String"), 9279 ) 9280 9281 # Update 9282 sql_update = f""" 9283 UPDATE {table_variants} 9284 SET "INFO" = 9285 concat( 9286 CASE 9287 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9288 THEN '' 9289 ELSE 
concat("INFO", ';') 9290 END, 9291 CASE 9292 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9293 AND dataframe_trio."{trio_infos}" NOT NULL 9294 THEN concat( 9295 '{trio_tag}=', 9296 dataframe_trio."{trio_infos}" 9297 ) 9298 ELSE '' 9299 END 9300 ) 9301 FROM dataframe_trio 9302 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9303 """ 9304 self.conn.execute(sql_update) 9305 9306 # Remove added columns 9307 for added_column in added_columns: 9308 self.drop_column(column=added_column) 9309 9310 # Delete dataframe 9311 del dataframe_trio 9312 gc.collect() 9313 9314 def calculation_vaf_normalization(self) -> None: 9315 """ 9316 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9317 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9318 :return: The function does not return anything. 9319 """ 9320 9321 # if FORMAT and samples 9322 if ( 9323 "FORMAT" in self.get_header_columns_as_list() 9324 and self.get_header_sample_list() 9325 ): 9326 9327 # vaf_normalization annotation field 9328 vaf_normalization_tag = "VAF" 9329 9330 # VCF infos tags 9331 vcf_infos_tags = { 9332 "VAF": "VAF Variant Frequency", 9333 } 9334 9335 # Prefix 9336 prefix = self.get_explode_infos_prefix() 9337 9338 # Variants table 9339 table_variants = self.get_table_variants() 9340 9341 # Header 9342 vcf_reader = self.get_header() 9343 9344 # Do not calculate if VAF already exists 9345 if "VAF" in vcf_reader.formats: 9346 log.debug("VAF already on genotypes") 9347 return 9348 9349 # Create variant id 9350 variant_id_column = self.get_variant_id_column() 9351 added_columns = [variant_id_column] 9352 9353 # variant_id, FORMAT and samples 9354 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9355 f""" "{sample}" """ for sample in self.get_header_sample_list() 9356 ) 9357 9358 # Create dataframe 9359 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} 
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        Compute per-variant genotype statistics for a FORMAT value (e.g. VAF) across
        all samples and append them to the INFO column of the variants table.

        Seven INFO tags are produced: `<info>_stats_nb`, `_list`, `_min`, `_max`,
        `_mean`, `_mediane` and `_stdev`. The per-row statistics are computed in
        pandas with the `genotype_stats` helper, then written back with one DuckDB
        `UPDATE ... FROM` joining on the variant id.

        :param info: name of the genotype information the statistics are computed
        on, defaults to VAF
        :type info: str (optional)
        """

        # Only meaningful for VCFs with genotypes (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # stats annotation field (e.g. "VAF_stats")
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one entry per produced statistic
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Column holding the computed stats dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (tracked so it can be dropped afterwards)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            # NOTE: the local variable name 'dataframe_vaf_stats' is referenced by
            # name in the SQL below (DuckDB replacement scan) — do not rename.
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create the stats column (a dict of statistics per row)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one CASE per statistic
            sql_vaf_stats_fields = []

            # For each statistic: extract it into its own column, declare the header
            # tag, and build the corresponding INFO fragment
            for stat in vcf_infos_tags:

                # Extract this statistic from the stats dict
                # (the lambda is applied immediately, so the loop variable 'stat'
                # is safe to close over)
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add this statistic tag to the header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # No leading ';' on the first fragment, ';' on all the following ones
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # INFO fragment for this statistic.
                # NOTE(review): if an earlier statistic is NULL (empty fragment) the
                # next fragment still carries its leading ';', which can produce a
                # doubled ';' in INFO — confirm whether downstream tolerates this.
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                                '{sep}{stat}=',
                                dataframe_vaf_stats."{stat}"
                            )
                        ELSE ''
                    END
                    """
                )

            # Fragments become successive arguments of concat()
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Update INFO with all statistics in a single statement
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_vaf_stats
            gc.collect()
lambda row: genotype_stats( 9475 row, samples=self.get_header_sample_list(), info=info 9476 ), 9477 axis=1, 9478 ) 9479 9480 # List of vcf tags 9481 sql_vaf_stats_fields = [] 9482 9483 # Check all VAF stats infos 9484 for stat in vcf_infos_tags: 9485 9486 # Extract stats 9487 dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply( 9488 lambda x: dict(x).get(stat, "") 9489 ) 9490 9491 # Add snpeff_hgvs to header 9492 vcf_reader.infos[stat] = vcf.parser._Info( 9493 stat, 9494 ".", 9495 "String", 9496 vcf_infos_tags.get(stat, "genotype statistics"), 9497 "howard calculation", 9498 "0", 9499 self.code_type_map.get("String"), 9500 ) 9501 9502 if len(sql_vaf_stats_fields): 9503 sep = ";" 9504 else: 9505 sep = "" 9506 9507 # Create fields to add in INFO 9508 sql_vaf_stats_fields.append( 9509 f""" 9510 CASE 9511 WHEN dataframe_vaf_stats."{stat}" NOT NULL 9512 THEN concat( 9513 '{sep}{stat}=', 9514 dataframe_vaf_stats."{stat}" 9515 ) 9516 ELSE '' 9517 END 9518 """ 9519 ) 9520 9521 # SQL set for update 9522 sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields) 9523 9524 # Update 9525 sql_update = f""" 9526 UPDATE {table_variants} 9527 SET "INFO" = 9528 concat( 9529 CASE 9530 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9531 THEN '' 9532 ELSE concat("INFO", ';') 9533 END, 9534 {sql_vaf_stats_fields_set} 9535 ) 9536 FROM dataframe_vaf_stats 9537 WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}" 9538 9539 """ 9540 self.conn.execute(sql_update) 9541 9542 # Remove added columns 9543 for added_column in added_columns: 9544 self.drop_column(column=added_column) 9545 9546 # Delete dataframe 9547 del dataframe_vaf_stats 9548 gc.collect() 9549 9550 def calculation_transcripts_annotation( 9551 self, info_json: str = None, info_format: str = None 9552 ) -> None: 9553 """ 9554 The `calculation_transcripts_annotation` function creates a transcripts table and adds an info 9555 field to it if transcripts are available. 
9556 9557 :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method 9558 is a string parameter that represents the information field to be used in the transcripts JSON. 9559 It is used to specify the JSON format for the transcripts information. If no value is provided 9560 when calling the method, it defaults to " 9561 :type info_json: str 9562 :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation` 9563 method is a string parameter that specifies the format of the information field to be used in 9564 the transcripts JSON. It is used to define the format of the information field 9565 :type info_format: str 9566 """ 9567 9568 # Create transcripts table 9569 transcripts_table = self.create_transcript_view() 9570 9571 # Add info field 9572 if transcripts_table: 9573 self.transcript_view_to_variants( 9574 transcripts_table=transcripts_table, 9575 transcripts_info_field_json=info_json, 9576 transcripts_info_field_format=info_format, 9577 ) 9578 else: 9579 log.info("No Transcripts to process. Check param.json file configuration") 9580 9581 def calculation_transcripts_prioritization(self) -> None: 9582 """ 9583 The function `calculation_transcripts_prioritization` creates a transcripts table and 9584 prioritizes transcripts based on certain criteria. 9585 """ 9586 9587 # Create transcripts table 9588 transcripts_table = self.create_transcript_view() 9589 9590 # Add info field 9591 if transcripts_table: 9592 self.transcripts_prioritization(transcripts_table=transcripts_table) 9593 else: 9594 log.info("No Transcripts to process. 
Check param.json file configuration") 9595 9596 ############### 9597 # Transcripts # 9598 ############### 9599 9600 def transcripts_prioritization( 9601 self, transcripts_table: str = None, param: dict = {} 9602 ) -> bool: 9603 """ 9604 The `transcripts_prioritization` function prioritizes transcripts based on certain parameters 9605 and updates the variants table with the prioritized information. 9606 9607 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 9608 of the table containing transcripts data. If no value is provided, it defaults to "transcripts". 9609 This parameter is used to identify the table where the transcripts data is stored for the 9610 prioritization process 9611 :type transcripts_table: str 9612 :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary 9613 that contains various configuration settings for the prioritization process of transcripts. It 9614 is used to customize the behavior of the prioritization algorithm and includes settings such as 9615 the prefix for prioritization fields, default profiles, and other 9616 :type param: dict 9617 :return: The function `transcripts_prioritization` returns a boolean value `True` if the 9618 transcripts prioritization process is successfully completed, and `False` if there are any 9619 issues or if no profile is defined for transcripts prioritization. 
9620 """ 9621 9622 log.debug("Start transcripts prioritization...") 9623 9624 # Param 9625 if not param: 9626 param = self.get_param() 9627 9628 # Variants table 9629 table_variants = self.get_table_variants() 9630 log.debug(f"transcripts_table={transcripts_table}") 9631 # Transcripts table 9632 if transcripts_table is None: 9633 log.debug(f"transcripts_table={transcripts_table}") 9634 transcripts_table = self.create_transcript_view( 9635 transcripts_table="transcripts", param=param 9636 ) 9637 log.debug(f"transcripts_table={transcripts_table}") 9638 if transcripts_table is None: 9639 msg_err = "No Transcripts table availalble" 9640 log.error(msg_err) 9641 raise ValueError(msg_err) 9642 9643 # Get transcripts columns 9644 columns_as_list_query = f""" 9645 DESCRIBE {transcripts_table} 9646 """ 9647 columns_as_list = list( 9648 self.get_query_to_df(columns_as_list_query)["column_name"] 9649 ) 9650 9651 # Create INFO if not exists 9652 if "INFO" not in columns_as_list: 9653 query_add_info = f""" 9654 ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT ''; 9655 """ 9656 self.execute_query(query_add_info) 9657 9658 # Prioritization param and Force only PZ Score and Flag 9659 pz_param = param.get("transcripts", {}).get("prioritization", {}) 9660 pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score" 9661 pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag" 9662 pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript" 9663 pz_param["pzfields"] = [pz_fields_score, pz_fields_flag] 9664 pz_profile_default = ( 9665 param.get("transcripts", {}).get("prioritization", {}).get("profiles", None) 9666 ) 9667 9668 # Exit if no profile 9669 if pz_profile_default is None: 9670 log.warning("No profile defined for transcripts prioritization") 9671 return False 9672 9673 # Prioritization 9674 prioritization_result = self.prioritization( 9675 table=transcripts_table, 9676 pz_param=param.get("transcripts", {}).get("prioritization", {}), 9677 ) 9678 if not 
        # --- tail of the transcripts prioritization method (definition begins above this chunk;
        # guard reconstructed from the visible "prioritization_result:" fragment — confirm) ---
        if not prioritization_result:
            log.warning("Transcripts prioritization not processed")
            return False

        # Explode PZ fields into dedicated columns of the transcripts table so the
        # ranking query below can ORDER BY them directly
        self.explode_infos(
            table=transcripts_table,
            fields=param.get("transcripts", {})
            .get("prioritization", {})
            .get("pzfields", []),
        )

        # Export Transcripts prioritization infos to variants table:
        # rank transcripts per variant (flag ASC, score DESC, transcript ASC as tie-break)
        # and append the best transcript's PZ fields to the variant INFO column
        query_update = f"""
            WITH RankedTranscripts AS (
                SELECT
                    "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag},
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC
                    ) AS rn
                FROM
                    {transcripts_table}
            )
            UPDATE {table_variants}
            SET
                INFO = CONCAT(CASE
                    WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                    THEN ''
                    ELSE concat("INFO", ';')
                END,
                concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag})
                )
            FROM
                RankedTranscripts
            WHERE
                rn = 1
                AND variants."#CHROM" = RankedTranscripts."#CHROM"
                AND variants."POS" = RankedTranscripts."POS"
                AND variants."REF" = RankedTranscripts."REF"
                AND variants."ALT" = RankedTranscripts."ALT"

        """
        self.execute_query(query=query_update)

        # Add PZ Transcript in header
        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
            pz_fields_transcripts,
            ".",
            "String",
            f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}",
            "unknown",
            "unknown",
            code_type_map["String"],
        )

        # Return
        return True

    def create_transcript_view_from_columns_map(
        self,
        transcripts_table: str = "transcripts",
        columns_maps: dict = {},
        added_columns: list = [],
        temporary_tables: list = None,
        annotation_fields: list = None,
    ) -> tuple[list, list, list]:
        """
        Create one temporary table per "columns map" entry by exploding
        comma-separated transcript columns of the variants table into one row
        per transcript.

        NOTE(review): `columns_maps` and `added_columns` are mutable default
        arguments, and `added_columns`/`temporary_tables`/`annotation_fields`
        are mutated in place AND returned — callers that also re-extend the
        returned lists will duplicate entries. Confirm intent.

        :param transcripts_table: base name used to derive the temporary table
            names, defaults to "transcripts"
        :param columns_maps: list of mappings, each with a "transcripts_column"
            (comma-separated transcript IDs) and "transcripts_infos_columns"
            (parallel comma-separated annotation columns)
        :param added_columns: accumulator of columns added to the variants
            table by `explode_infos` (to be dropped later by the caller)
        :param temporary_tables: accumulator of created temporary table names
        :param annotation_fields: accumulator of annotation field names
        :return: tuple (added_columns, temporary_tables, annotation_fields)
        """

        log.debug("Start transcrpts view creation from columns map...")

        # Example of the expected param structure:
        # "from_columns_map": [
        #     {
        #         "transcripts_column": "Ensembl_transcriptid",
        #         "transcripts_infos_columns": [
        #             "genename",
        #             "Ensembl_geneid",
        #             "LIST_S2_score",
        #             "LIST_S2_pred",
        #         ],
        #     },
        #     {
        #         "transcripts_column": "Ensembl_transcriptid",
        #         "transcripts_infos_columns": [
        #             "genename",
        #             "VARITY_R_score",
        #             "Aloft_pred",
        #         ],
        #     },
        # ],

        # Init
        if temporary_tables is None:
            temporary_tables = []
        if annotation_fields is None:
            annotation_fields = []

        # Variants table
        table_variants = self.get_table_variants()

        for columns_map in columns_maps:

            # Transcript column
            transcripts_column = columns_map.get("transcripts_column", None)

            # Transcripts infos columns
            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])

            if transcripts_column is not None:

                # Explode INFO fields into columns of the variants table
                added_columns += self.explode_infos(
                    fields=[transcripts_column] + transcripts_infos_columns
                )

                # View clauses: split each comma-separated column into rows
                clause_select = []
                for field in [transcripts_column] + transcripts_infos_columns:
                    clause_select.append(
                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
                    )
                    if field not in [transcripts_column]:
                        annotation_fields.append(field)

                # Query View
                # NOTE(review): clause_select is applied in both the inner and
                # outer SELECT, i.e. regexp_split_to_table runs twice on the
                # same columns — confirm this is intentional.
                query = f"""
                    SELECT
                        "#CHROM", POS, REF, ALT, INFO,
                        "{transcripts_column}" AS 'transcript',
                        {", ".join(clause_select)}
                    FROM (
                        SELECT
                            "#CHROM", POS, REF, ALT, INFO,
                            {", ".join(clause_select)}
                        FROM {table_variants}
                    )
                    WHERE "{transcripts_column}" IS NOT NULL
                """

                # Create temporary table with a randomized suffix to avoid clashes
                temporary_table = transcripts_table + "".join(
                    random.choices(string.ascii_uppercase + string.digits, k=10)
                )

                # Temporary_tables
                temporary_tables.append(temporary_table)
                query_view = f"""
                    CREATE TEMPORARY TABLE {temporary_table}
                    AS ({query})
                """
                self.execute_query(query=query_view)

        return added_columns, temporary_tables, annotation_fields

    def create_transcript_view_from_column_format(
        self,
        transcripts_table: str = "transcripts",
        column_formats: dict = {},
        temporary_tables: list = None,
        annotation_fields: list = None,
    ) -> tuple[list, list, list]:
        """
        Create one temporary table per "column format" entry by parsing a
        structured annotation field (e.g. snpEff "ANN") via
        `annotation_format_to_table`.

        NOTE(review): `column_formats` is a mutable default argument;
        `temporary_tables`/`annotation_fields` are mutated in place AND
        returned (see sibling method note).

        :param transcripts_table: base name used to derive temporary view
            names, defaults to "transcripts"
        :param column_formats: list of mappings with "transcripts_column"
            (annotation INFO field, default "ANN") and
            "transcripts_infos_column" (transcript ID sub-field, default
            "Feature_ID")
        :param temporary_tables: accumulator of created temporary view names
        :param annotation_fields: accumulator of annotation field names
            (columns of each view except #CHROM/POS/REF/ALT)
        :return: tuple (temporary_tables, annotation_fields)
        """

        log.debug("Start transcrpts view creation from column format...")

        # Example of the expected param structure:
        # "from_column_format": [
        #     {
        #         "transcripts_column": "ANN",
        #         "transcripts_infos_column": "Feature_ID",
        #     }
        # ],

        # Init
        if temporary_tables is None:
            temporary_tables = []
        if annotation_fields is None:
            annotation_fields = []

        for column_format in column_formats:

            # annotation field and transcript annotation field
            annotation_field = column_format.get("transcripts_column", "ANN")
            transcript_annotation = column_format.get(
                "transcripts_infos_column", "Feature_ID"
            )

            # Temporary View name (randomized suffix to avoid clashes)
            temporary_view_name = transcripts_table + "".join(
                random.choices(string.ascii_uppercase + string.digits, k=10)
            )

            # Create temporary view (returns None if the annotation field is
            # absent from the header)
            temporary_view_name = self.annotation_format_to_table(
                uniquify=True,
                annotation_field=annotation_field,
                view_name=temporary_view_name,
                annotation_id=transcript_annotation,
            )

            # Annotation fields
            if temporary_view_name:
                query_annotation_fields = f"""
                    SELECT *
                    FROM (
                        DESCRIBE SELECT *
                        FROM {temporary_view_name}
                    )
                    WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
                """
                df_annotation_fields = self.get_query_to_df(
                    query=query_annotation_fields
                )

                # Add temporary view and annotation fields
                temporary_tables.append(temporary_view_name)
                annotation_fields += list(set(df_annotation_fields["column_name"]))

        return temporary_tables, annotation_fields

    def create_transcript_view(
        self,
        transcripts_table: str = None,
        transcripts_table_drop: bool = True,
        param: dict = {},
    ) -> str:
        """
        Build the transcripts table by merging all per-source temporary tables
        (from columns maps and column formats) and aggregating annotations per
        (#CHROM, POS, REF, ALT, INFO, transcript).

        :param transcripts_table: name of the table to create; when None it is
            read from param["transcripts"]["table"], defaulting to "transcripts"
        :param transcripts_table_drop: drop an existing transcripts table
            before (re)creating it, defaults to True
        :param param: parameters dict; falls back to `self.get_param()` when empty.
            NOTE(review): mutable default argument.
        :return: the name of the created transcripts table, or None when no
            "struct" section is configured
        """

        log.debug("Start transcripts view creation...")

        # Default
        transcripts_table_default = "transcripts"

        # Param
        if not param:
            param = self.get_param()

        # Struct
        struct = param.get("transcripts", {}).get("struct", None)

        if struct:

            # Transcripts table
            if transcripts_table is None:
                transcripts_table = param.get("transcripts", {}).get(
                    "table", transcripts_table_default
                )

            # added_columns
            added_columns = []

            # Temporary tables
            temporary_tables = []

            # Annotation fields
            annotation_fields = []

            # from columns map
            # NOTE(review): the helper mutates the lists passed in (in place)
            # AND returns the same objects, so the "+=" re-extensions below
            # duplicate every entry; UNION's row de-duplication and the
            # idempotence of drop_column appear to mask this — confirm intent.
            columns_maps = struct.get("from_columns_map", [])
            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_columns_map(
                    transcripts_table=transcripts_table,
                    columns_maps=columns_maps,
                    added_columns=added_columns,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            added_columns += added_columns_tmp
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # from column format (same in-place-mutation caveat as above)
            column_formats = struct.get("from_column_format", [])
            temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_column_format(
                    transcripts_table=transcripts_table,
                    column_formats=column_formats,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # Merge temporary tables query
            query_merge = ""
            for temporary_table in temporary_tables:

                # First temporary table
                if not query_merge:
                    query_merge = f"""
                        SELECT * FROM {temporary_table}
                    """
                # other temporary table (using UNION)
                else:
                    query_merge += f"""
                        UNION BY NAME SELECT * FROM {temporary_table}
                    """

            # Merge on transcript
            query_merge_on_transcripts_annotation_fields = []
            # Aggregate all annotations fields (distinct values joined by ',')
            for annotation_field in set(annotation_fields):
                query_merge_on_transcripts_annotation_fields.append(
                    f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """
                )
            # Query for transcripts view
            query_merge_on_transcripts = f"""
                SELECT "#CHROM", POS, REF, ALT, INFO, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)}
                FROM ({query_merge})
                GROUP BY "#CHROM", POS, REF, ALT, INFO, transcript
            """

            # Drop transcript view if necessary
            if transcripts_table_drop:
                query_drop = f"""
                    DROP TABLE IF EXISTS {transcripts_table};
                """
                self.execute_query(query=query_drop)

            # Merge and create transcript view
            query_create_view = f"""
                CREATE TABLE IF NOT EXISTS {transcripts_table}
                AS {query_merge_on_transcripts}
            """
            self.execute_query(query=query_create_view)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

        else:

            transcripts_table = None

        return transcripts_table

    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
    ) -> str:
        """
        Convert a structured annotation INFO field (e.g. snpEff "ANN") into a
        temporary table with one column per annotation sub-field and one row
        per annotation entry.

        The sub-field names are parsed from the single-quoted part of the
        field's header description (" | "-separated).

        :param uniquify: passed to `explode_annotation_format` to de-duplicate
            annotation values, defaults to True
        :param annotation_field: INFO field holding the annotations, defaults
            to "ANN"
        :param annotation_id: annotation sub-field used as the transcript
            identifier column, defaults to "Feature_ID" (non-alphanumeric
            characters are stripped)
        :param view_name: name of the temporary table to create, defaults to
            "transcripts"
        :return: the created table name, or None when `annotation_field` is
            not present in the header
        """

        # Name of the intermediate JSON column built from the annotation field
        annotation_format = "annotation_explode"

        # Transcript annotation: keep alphanumeric characters only
        annotation_id = "".join(char for char in annotation_id if char.isalnum())

        # Prefix
        # NOTE(review): any non-empty configured prefix is overridden to
        # "INFO/" here — confirm this is intentional.
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Annotation fields
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add columns
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract ANN header: sub-field names are inside single quotes,
            # separated by " | "
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Create annotation columns (JSON per variant)
            # NOTE(review): the column is named with the "{prefix}" prefix
            # (annotation_format_infos), but the DuckDB queries below reference
            # the bare "{annotation_format}" name — verify these match when a
            # prefix is in effect.
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Find keys present in the first JSON entry
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Check keys
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key
                key = row.iloc[0]

                # key_clean: alphanumeric-only version usable as a column name
                key_clean = "".join(char for char in key if char.isalnum())

                # Type
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type from the remaining non-empty values
                column_type = detect_column_type(df_json_type[key_clean])

                # Append a typed extraction clause for this key
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Create view
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                    )
                );
            """
            self.execute_query(query=query_view)

        else:

            # Return None
            view_name = None

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name

    def transcript_view_to_variants(
        self,
        transcripts_table: str = None,
        transcripts_column_id: str = None,
        transcripts_info_json: str = None,
        transcripts_info_field_json: str = None,
        transcripts_info_format: str = None,
        transcripts_info_field_format: str = None,
        param: dict = {},
    ) -> bool:
        """
        Export the transcripts table back onto the variants table, either as a
        dedicated JSON column, a JSON INFO field, a structured-format column,
        an INFO field in structured format, or any combination.

        :param transcripts_table: transcripts table name; falls back to
            param["transcripts"]["table"], default "transcripts"
        :param transcripts_column_id: transcript identifier column; falls back
            to param["transcripts"]["column_id"], default "transcript"
        :param transcripts_info_json: name of the JSON column to add to the
            variants table (None to skip)
        :param transcripts_info_field_json: name of the INFO field carrying the
            transcripts as JSON (None to skip)
        :param transcripts_info_format: name of the VARCHAR column to add with
            transcripts in '|' -separated format (None to skip)
        :param transcripts_info_field_format: name of the INFO field carrying
            the transcripts in '|' -separated format (None to skip)
        :param param: parameters dict; falls back to `self.get_param()` when
            empty. NOTE(review): mutable default argument.
        :return: True on success; False when none of the four output targets
            is configured
        """

        msg_info_prefix = "Start transcripts view to variants annotations"

        log.debug(f"{msg_info_prefix}...")

        # Default
        transcripts_table_default = "transcripts"
        transcripts_column_id_default = "transcript"
        transcripts_info_json_default = None
        transcripts_info_format_default = None
        transcripts_info_field_json_default = None
        transcripts_info_field_format_default = None

        # Param
        if not param:
            param = self.get_param()

        # Transcripts table
        if transcripts_table is None:
            transcripts_table = param.get("transcripts", {}).get(
                "table", transcripts_table_default
            )

        # Transcripts column ID
        if transcripts_column_id is None:
            transcripts_column_id = param.get("transcripts", {}).get(
                "column_id", transcripts_column_id_default
            )

        # Transcripts info json
        if transcripts_info_json is None:
            transcripts_info_json = param.get("transcripts", {}).get(
                "transcripts_info_json", transcripts_info_json_default
            )

        # Transcripts info field JSON
        if transcripts_info_field_json is None:
            transcripts_info_field_json = param.get("transcripts", {}).get(
                "transcripts_info_field_json", transcripts_info_field_json_default
            )
        # NOTE(review): when transcripts_info_field_json is set while
        # transcripts_info_json stays None, the update query below renders
        # "t.None"/"AS None" — the disabled fallback here would have prevented
        # that. Confirm whether it should be re-enabled.
        # if transcripts_info_field_json is not None and transcripts_info_json is None:
        #     transcripts_info_json = transcripts_info_field_json

        # Transcripts info format
        if transcripts_info_format is None:
            transcripts_info_format = param.get("transcripts", {}).get(
                "transcripts_info_format", transcripts_info_format_default
            )

        # Transcripts info field FORMAT
        if transcripts_info_field_format is None:
            transcripts_info_field_format = param.get("transcripts", {}).get(
                "transcripts_info_field_format", transcripts_info_field_format_default
            )
        # NOTE(review): same field-without-column caveat as the JSON case above.
        # if (
        #     transcripts_info_field_format is not None
        #     and transcripts_info_format is None
        # ):
        #     transcripts_info_format = transcripts_info_field_format

        # Variants table
        table_variants = self.get_table_variants()

        # Check info columns param: nothing to do if no output target is set
        if (
            transcripts_info_json is None
            and transcripts_info_field_json is None
            and transcripts_info_format is None
            and transcripts_info_field_format is None
        ):
            return False

        # Transcripts infos columns
        query_transcripts_infos_columns = f"""
            SELECT *
            FROM (
                DESCRIBE SELECT * FROM {transcripts_table}
            )
            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
        """
        transcripts_infos_columns = list(
            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
        )

        # View results: split comma-separated values into rows, and build the
        # JSON / '|'-joined projections of each annotation column
        clause_select = []
        clause_to_json = []
        clause_to_format = []
        for field in transcripts_infos_columns:
            clause_select.append(
                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
            )
            clause_to_json.append(f""" '{field}': "{field}" """)
            clause_to_format.append(f""" "{field}" """)

        # Update
        update_set_json = []
        update_set_format = []

        # VCF header
        vcf_reader = self.get_header()

        # Transcripts to info column in JSON
        if transcripts_info_json is not None:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_json,
                column_type="JSON",
                default_value=None,
                drop=False,
            )

            # Add header
            # NOTE(review): "unknwon" is a typo, but it is a runtime value
            # (the _Info source/version fields) — left unchanged here; the
            # prioritization code above uses "unknown".
            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
                transcripts_info_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_json.append(
                f""" {transcripts_info_json}=t.{transcripts_info_json} """
            )

        # Transcripts to info field in JSON
        if transcripts_info_field_json is not None:

            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")

            # Add to update
            update_set_json.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_json}=',
                            t.{transcripts_info_json}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header
            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
                transcripts_info_field_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_json:

            # Update query: aggregate one JSON object per variant keyed by
            # transcript ID, then join it back onto the variants table
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_json)}
                FROM
                    (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            concat(
                                '{{',
                                string_agg(
                                    '"' || "{transcripts_column_id}" || '":' ||
                                    to_json(json_output)
                                ),
                                '}}'
                            )::JSON AS {transcripts_info_json}
                        FROM
                            (
                                SELECT
                                    "#CHROM", POS, REF, ALT,
                                    "{transcripts_column_id}",
                                    to_json(
                                        {{{",".join(clause_to_json)}}}
                                    )::JSON AS json_output
                                FROM
                                    (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                                WHERE "{transcripts_column_id}" IS NOT NULL
                            )
                        GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        # Transcripts to info column in FORMAT
        if transcripts_info_format is not None:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_format,
                column_type="VARCHAR",
                default_value=None,
                drop=False,
            )

            # Add header
            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
                transcripts_info_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_format.append(
                f""" {transcripts_info_format}=t.{transcripts_info_format} """
            )

        # Transcripts to info field in FORMAT
        if transcripts_info_field_format is not None:

            log.debug(f"{msg_info_prefix} - Annotation in structured format...")

            # Add to update
            update_set_format.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_format}=',
                            t.{transcripts_info_format}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header
            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
                transcripts_info_field_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_format:

            # Update query: build one '|'-separated record per transcript and
            # aggregate them per variant
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_format)}
                FROM
                    (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
                        FROM
                            (
                                SELECT
                                    "#CHROM", POS, REF, ALT,
                                    "{transcripts_column_id}",
                                    concat(
                                        "{transcripts_column_id}",
                                        '|',
                                        {", '|', ".join(clause_to_format)}
                                    ) AS {transcripts_info_format}
                                FROM
                                    (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                            )
                        GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        return True
def __init__(
    self,
    conn=None,
    input: str = None,
    output: str = None,
    config: dict = {},
    param: dict = {},
    load: bool = False,
) -> None:
    """
    Initialize a Variants object: set input/output, configuration,
    parameters, database connection, VCF header and samples, and
    optionally load the input data.

    :param conn: an existing database connection (one is created when None)
    :param input: the input file path
    :param output: the output file path
    :param config: a dictionary containing the configuration
    :param param: a dictionary containing the parameters
    :param load: when True, load the input data at the end of initialization
    """
    # NOTE(review): `config` and `param` use mutable default arguments ({}),
    # shared across calls; safe only if set_config/set_param never mutate
    # them in place — confirm.

    # Init variables
    self.init_variables()

    # Input
    self.set_input(input)

    # Config
    self.set_config(config)

    # Param
    self.set_param(param)

    # Output
    self.set_output(output)

    # Connection (spelled "connexion" in the API)
    self.set_connexion(conn)

    # Header
    self.set_header()

    # Samples
    self.set_samples()

    # Load data
    if load:
        self.load_data()
The function __init__ initializes the variables, sets the input, output, config, param, connexion and
header
Parameters
- conn: the connection to the database
- input: the input file
- output: the output file
- config: a dictionary containing the configuration of the model
- param: a dictionary containing the parameters of the model
84 def set_samples(self, samples: list = None) -> list: 85 """ 86 The function `set_samples` sets the samples attribute of an object to a provided list or 87 retrieves it from a parameter dictionary. 88 89 :param samples: The `set_samples` method is a method of a class that takes a list of samples as 90 input and sets the `samples` attribute of the class to the provided list. If no samples are 91 provided, it tries to get the samples from the class's parameters using the `get_param` method 92 :type samples: list 93 :return: The `samples` list is being returned. 94 """ 95 96 if not samples: 97 samples = self.get_param().get("samples", {}).get("list", None) 98 99 self.samples = samples 100 101 return samples
The function set_samples sets the samples attribute of an object to a provided list or
retrieves it from a parameter dictionary.
Parameters
- samples: The
The `set_samples` method takes a list of samples as input and sets the `samples` attribute of the class to the provided list. If no samples are provided, it tries to get the samples from the class's parameters using the `get_param` method.
Returns
The
`samples` list is being returned.
103 def get_samples(self) -> list: 104 """ 105 This function returns a list of samples. 106 :return: The `get_samples` method is returning the `samples` attribute of the object. 107 """ 108 109 return self.samples
This function returns a list of samples.
Returns
The
`get_samples` method is returning the `samples` attribute of the object.
    def get_samples_check(self) -> bool:
        """
        Return the "check" flag from the "samples" section of the parameters.

        :return: the value of the key "check" inside the "samples" dictionary
            returned by `get_param()`; defaults to True when the key is
            missing (note: the check is enabled by default).
        """

        return self.get_param().get("samples", {}).get("check", True)
This function returns the value of the "check" key within the "samples" dictionary retrieved from the parameters.
Returns
The method
`get_samples_check` method is returning the value of the key "check" inside the "samples" dictionary, which is nested inside the dictionary returned by the `get_param()` method. If the key "check" is not found, it will return `True` (the check is enabled by default).
122 def set_input(self, input: str = None) -> None: 123 """ 124 The function `set_input` takes a file name as input, extracts the name and extension, and sets 125 attributes in the class accordingly. 126 127 :param input: The `set_input` method in the provided code snippet is used to set attributes 128 related to the input file. Here's a breakdown of the parameters and their usage in the method: 129 :type input: str 130 """ 131 132 if input and not isinstance(input, str): 133 try: 134 self.input = input.name 135 except: 136 log.error(f"Input file '{input} in bad format") 137 raise ValueError(f"Input file '{input} in bad format") 138 else: 139 self.input = input 140 141 # Input format 142 if input: 143 input_name, input_extension = os.path.splitext(self.input) 144 self.input_name = input_name 145 self.input_extension = input_extension 146 self.input_format = self.input_extension.replace(".", "")
The function set_input takes a file name as input, extracts the name and extension, and sets
attributes in the class accordingly.
Parameters
- input: The
set_inputmethod in the provided code snippet is used to set attributes related to the input file. Here's a breakdown of the parameters and their usage in the method:
148 def set_config(self, config: dict) -> None: 149 """ 150 The set_config function takes a config object and assigns it as the configuration object for the 151 class. 152 153 :param config: The `config` parameter in the `set_config` function is a dictionary object that 154 contains configuration settings for the class. When you call the `set_config` function with a 155 dictionary object as the argument, it will set that dictionary as the configuration object for 156 the class 157 :type config: dict 158 """ 159 160 self.config = config
The set_config function takes a config object and assigns it as the configuration object for the class.
Parameters
- config: The
configparameter in theset_configfunction is a dictionary object that contains configuration settings for the class. When you call theset_configfunction with a dictionary object as the argument, it will set that dictionary as the configuration object for the class
162 def set_param(self, param: dict) -> None: 163 """ 164 This function sets a parameter object for the class based on the input dictionary. 165 166 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 167 as the `param` attribute of the class instance 168 :type param: dict 169 """ 170 171 self.param = param
This function sets a parameter object for the class based on the input dictionary.
Parameters
- param: The
set_parammethod you provided takes a dictionary object as input and sets it as theparamattribute of the class instance
173 def init_variables(self) -> None: 174 """ 175 This function initializes the variables that will be used in the rest of the class 176 """ 177 178 self.prefix = "howard" 179 self.table_variants = "variants" 180 self.dataframe = None 181 182 self.comparison_map = { 183 "gt": ">", 184 "gte": ">=", 185 "lt": "<", 186 "lte": "<=", 187 "equals": "=", 188 "contains": "SIMILAR TO", 189 } 190 191 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 192 193 self.code_type_map_to_sql = { 194 "Integer": "INTEGER", 195 "String": "VARCHAR", 196 "Float": "FLOAT", 197 "Flag": "VARCHAR", 198 } 199 200 self.index_additionnal_fields = []
This function initializes the variables that will be used in the rest of the class
202 def get_indexing(self) -> bool: 203 """ 204 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 205 returns False. 206 :return: The value of the indexing parameter. 207 """ 208 209 return self.get_param().get("indexing", False)
It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.
Returns
The value of the indexing parameter.
211 def get_connexion_config(self) -> dict: 212 """ 213 The function `get_connexion_config` returns a dictionary containing the configuration for a 214 connection, including the number of threads and memory limit. 215 :return: a dictionary containing the configuration for the Connexion library. 216 """ 217 218 # config 219 config = self.get_config() 220 221 # Connexion config 222 connexion_config = {} 223 threads = self.get_threads() 224 225 # Threads 226 if threads: 227 connexion_config["threads"] = threads 228 229 # Memory 230 # if config.get("memory", None): 231 # connexion_config["memory_limit"] = config.get("memory") 232 if self.get_memory(): 233 connexion_config["memory_limit"] = self.get_memory() 234 235 # Temporary directory 236 if config.get("tmp", None): 237 connexion_config["temp_directory"] = config.get("tmp") 238 239 # Access 240 if config.get("access", None): 241 access = config.get("access") 242 if access in ["RO"]: 243 access = "READ_ONLY" 244 elif access in ["RW"]: 245 access = "READ_WRITE" 246 connexion_db = self.get_connexion_db() 247 if connexion_db in ":memory:": 248 access = "READ_WRITE" 249 connexion_config["access_mode"] = access 250 251 return connexion_config
The function get_connexion_config returns a dictionary containing the configuration for a
connection, including the number of threads and memory limit.
Returns
a dictionary containing the configuration for the Connexion library.
253 def get_duckdb_settings(self) -> dict: 254 """ 255 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 256 string. 257 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 258 """ 259 260 # config 261 config = self.get_config() 262 263 # duckdb settings 264 duckdb_settings_dict = {} 265 if config.get("duckdb_settings", None): 266 duckdb_settings = config.get("duckdb_settings") 267 duckdb_settings = full_path(duckdb_settings) 268 # duckdb setting is a file 269 if os.path.exists(duckdb_settings): 270 with open(duckdb_settings) as json_file: 271 duckdb_settings_dict = yaml.safe_load(json_file) 272 # duckdb settings is a string 273 else: 274 duckdb_settings_dict = json.loads(duckdb_settings) 275 276 return duckdb_settings_dict
The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a
string.
Returns
The function
get_duckdb_settingsreturns a dictionary objectduckdb_settings_dict.
278 def set_connexion_db(self) -> str: 279 """ 280 The function `set_connexion_db` returns the appropriate database connection string based on the 281 input format and connection type. 282 :return: the value of the variable `connexion_db`. 283 """ 284 285 # Default connexion db 286 default_connexion_db = ":memory:" 287 288 # Find connexion db 289 if self.get_input_format() in ["db", "duckdb"]: 290 connexion_db = self.get_input() 291 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 292 connexion_db = default_connexion_db 293 elif self.get_connexion_type() in ["tmpfile"]: 294 tmp_name = tempfile.mkdtemp( 295 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 296 ) 297 connexion_db = f"{tmp_name}/tmp.db" 298 elif self.get_connexion_type() != "": 299 connexion_db = self.get_connexion_type() 300 else: 301 connexion_db = default_connexion_db 302 303 # Set connexion db 304 self.connexion_db = connexion_db 305 306 return connexion_db
The function set_connexion_db returns the appropriate database connection string based on the
input format and connection type.
Returns
the value of the variable
connexion_db.
    def set_connexion(self, conn) -> None:
        """
        Create (or adopt) the database connection, honoring the configured
        format (duckdb or sqlite), connection string and DuckDB settings.

        :param conn: existing connection to reuse; when None, a new connection
            is opened on the string returned by `set_connexion_db`
        """

        # Connexion db
        connexion_db = self.set_connexion_db()

        # Connexion config
        connexion_config = self.get_connexion_config()

        # Connexion format (duckdb is the default backend)
        connexion_format = self.get_config().get("connexion_format", "duckdb")
        # Set connexion format
        self.connexion_format = connexion_format

        # Connexion
        if not conn:
            if connexion_format in ["duckdb"]:
                conn = duckdb.connect(connexion_db, config=connexion_config)
                # Apply duckDB settings as PRAGMA statements
                duckdb_settings = self.get_duckdb_settings()
                if duckdb_settings:
                    for setting in duckdb_settings:
                        setting_value = duckdb_settings.get(setting)
                        # String values must be quoted in the PRAGMA statement
                        if isinstance(setting_value, str):
                            setting_value = f"'{setting_value}'"
                        conn.execute(f"PRAGMA {setting}={setting_value};")
            elif connexion_format in ["sqlite"]:
                conn = sqlite3.connect(connexion_db)

        # Set connexion
        self.conn = conn

        # Log
        log.debug(f"connexion_format: {connexion_format}")
        log.debug(f"connexion_db: {connexion_db}")
        log.debug(f"connexion config: {connexion_config}")
        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
The function set_connexion creates a connection to a database, with options for different
database formats and settings.
Parameters
- conn: The
`conn` parameter in the `set_connexion` method is the connection to the database. If a connection is not provided, a new connection to an in-memory database is created. The method then proceeds to set up the connection based on the specified format (e.g., duckdb or sqlite).
354 def set_output(self, output: str = None) -> None: 355 """ 356 The `set_output` function in Python sets the output file based on the input or a specified key 357 in the config file, extracting the output name, extension, and format. 358 359 :param output: The `output` parameter in the `set_output` method is used to specify the name of 360 the output file. If the config file has an 'output' key, the method sets the output to the value 361 of that key. If no output is provided, it sets the output to `None` 362 :type output: str 363 """ 364 365 if output and not isinstance(output, str): 366 self.output = output.name 367 else: 368 self.output = output 369 370 # Output format 371 if self.output: 372 output_name, output_extension = os.path.splitext(self.output) 373 self.output_name = output_name 374 self.output_extension = output_extension 375 self.output_format = self.output_extension.replace(".", "") 376 else: 377 self.output_name = None 378 self.output_extension = None 379 self.output_format = None
The set_output function in Python sets the output file based on the input or a specified key
in the config file, extracting the output name, extension, and format.
Parameters
- output: The
`output` parameter in the `set_output` method is used to specify the name of the output file. If the config file has an 'output' key, the method sets the output to the value of that key. If no output is provided, it sets the output to `None`.
    def set_header(self) -> None:
        """
        Read the header of the input file and store it both as a list of lines
        (`self.header_list`) and as a PyVCF object (`self.header_vcf`).

        Header resolution order:
        - explicit header file from config ("header_file"),
        - the header embedded in a VCF/HDR input file (bgzf or plain),
        - an external "<input>.hdr" file next to the input,
        - a header inferred from the file columns (via Database),
        - a minimal default VCF header as last resort.
        """

        input_file = self.get_input()
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM POS ID REF ALT QUAL FILTER INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with real columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    # NOTE(review): broad bare except — any failure silently
                    # falls back to the default header
                    except:

                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # unknown input format: fail

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            self.header_list = None
            self.header_vcf = None
It reads the header of a VCF file and stores it as a list of strings and as a VCF object
483 def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame: 484 """ 485 The `get_query_to_df` function takes a query as a string and returns the result as a pandas 486 DataFrame based on the connection format. 487 488 :param query: The `query` parameter in the `get_query_to_df` function is a string that 489 represents the SQL query you want to execute. This query will be used to fetch data from a 490 database and convert it into a pandas DataFrame 491 :type query: str 492 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 493 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 494 function will only fetch up to that number of rows from the database query result. If no limit 495 is specified, 496 :type limit: int 497 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 498 """ 499 500 # Connexion format 501 connexion_format = self.get_connexion_format() 502 503 # Limit in query 504 if limit: 505 pd.set_option("display.max_rows", limit) 506 if connexion_format in ["duckdb"]: 507 df = ( 508 self.conn.execute(query) 509 .fetch_record_batch(limit) 510 .read_next_batch() 511 .to_pandas() 512 ) 513 elif connexion_format in ["sqlite"]: 514 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 515 516 # Full query 517 else: 518 if connexion_format in ["duckdb"]: 519 df = self.conn.execute(query).df() 520 elif connexion_format in ["sqlite"]: 521 df = pd.read_sql_query(query, self.conn) 522 523 return df
The get_query_to_df function takes a query as a string and returns the result as a pandas
DataFrame based on the connection format.
Parameters
- query: The
queryparameter in theget_query_to_dffunction is a string that represents the SQL query you want to execute. This query will be used to fetch data from a database and convert it into a pandas DataFrame - limit: The
limitparameter in theget_query_to_dffunction is used to specify the maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the function will only fetch up to that number of rows from the database query result. If no limit is specified,
Returns
A pandas DataFrame is being returned by the
get_query_to_dffunction.
525 def get_overview(self) -> None: 526 """ 527 The function prints the input, output, config, and dataframe of the current object 528 """ 529 table_variants_from = self.get_table_variants(clause="from") 530 sql_columns = self.get_header_columns_as_sql() 531 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 532 df = self.get_query_to_df(sql_query_export) 533 log.info( 534 "Input: " 535 + str(self.get_input()) 536 + " [" 537 + str(str(self.get_input_format())) 538 + "]" 539 ) 540 log.info( 541 "Output: " 542 + str(self.get_output()) 543 + " [" 544 + str(str(self.get_output_format())) 545 + "]" 546 ) 547 log.info("Config: ") 548 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 549 "\n" 550 ): 551 log.info("\t" + str(d)) 552 log.info("Param: ") 553 for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 554 "\n" 555 ): 556 log.info("\t" + str(d)) 557 log.info("Sample list: " + str(self.get_header_sample_list())) 558 log.info("Dataframe: ") 559 for d in str(df).split("\n"): 560 log.info("\t" + str(d)) 561 562 # garbage collector 563 del df 564 gc.collect() 565 566 return None
The function prints the input, output, config, and dataframe of the current object
    def get_stats(self) -> dict:
        """
        Compute statistics on the loaded variants: input file, counts by
        chromosome, per-sample genotype counts, INFO/FORMAT header fields,
        QUAL distribution and SNV/MNV/InDel breakdown.

        :return: a nested dictionary with sections "Infos", "Variants",
            "Samples", "Header" and (when QUAL is present) "Quality".
        """

        # Log
        log.info(f"Stats Calculation...")

        # table variants
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: only meaningful when a GT FORMAT field and a FORMAT
        # column are present
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Count genotypes per sample; rows are kept only when the
                # sample column has as many ':'-separated fields as FORMAT
                sql_query_samples = f"""
                    SELECT
                        '{sample}' as sample,
                        REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                        count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                        concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                    )
                    GROUP BY genotype
                """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

        stats["Samples"] = samples
        stats["Infos"]["Number of samples"] = nb_of_samples

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        # `i` keys rows across both sections so row ids stay unique
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num: map PyVCF special codes to VCF Number letters
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                SELECT
                    avg(CAST(QUAL AS INTEGER)) AS Average,
                    min(CAST(QUAL AS INTEGER)) AS Minimum,
                    max(CAST(QUAL AS INTEGER)) AS Maximum,
                    stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                    median(CAST(QUAL AS INTEGER)) AS Median,
                    variance(CAST(QUAL AS INTEGER)) AS Variance
                FROM {table_variants_from}
                WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
            """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel

        # NOTE(review): the InDel clause 'len(REF) > 1 OR len(ALT) > 1 AND
        # len(REF) != len(ALT)' relies on AND binding tighter than OR —
        # confirm this grouping is intended
        sql_query_snv = f"""

            SELECT Type, count FROM (

                SELECT
                    'Total' AS Type,
                    count(*) AS count
                FROM {table_variants_from}

                UNION

                SELECT
                    'MNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 AND len(ALT) > 1
                AND len(REF) = len(ALT)

                UNION

                SELECT
                    'InDel' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 OR len(ALT) > 1
                AND len(REF) != len(ALT)

                UNION

                SELECT
                    'SNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1

            )

            ORDER BY count DESC

        """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # Substitution spectrum for SNVs (e.g. 'A>G')
        sql_query_snv_substitution = f"""
            SELECT
                concat(REF, '>', ALT) AS 'Substitution',
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) = 1 AND len(ALT) = 1
            GROUP BY REF, ALT
            ORDER BY count(*) DESC
        """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
The get_stats function calculates and returns various statistics of the current object,
including information about the input file, variants, samples, header fields, quality, and
SNVs/InDels.
Returns
a dictionary containing various statistics of the current object. The dictionary has the following structure:
790 def stats_to_file(self, file: str = None) -> str: 791 """ 792 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 793 into a JSON object, and writes the JSON object to the specified file. 794 795 :param file: The `file` parameter is a string that represents the file path where the JSON data 796 will be written 797 :type file: str 798 :return: the name of the file that was written to. 799 """ 800 801 # Get stats 802 stats = self.get_stats() 803 804 # Serializing json 805 json_object = json.dumps(stats, indent=4) 806 807 # Writing to sample.json 808 with open(file, "w") as outfile: 809 outfile.write(json_object) 810 811 return file
The function stats_to_file takes a file name as input, retrieves statistics, serializes them
into a JSON object, and writes the JSON object to the specified file.
Parameters
- file: The
fileparameter is a string that represents the file path where the JSON data will be written
Returns
the name of the file that was written to.
    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        Generate a markdown report of the statistics and print it to stdout;
        also write the statistics as a JSON file.

        :param output_file: path of the markdown output file; a temporary
            file is used when None
        :param json_file: path of the JSON stats file; a temporary file is
            used when None
        :return: None
        """

        # Full path
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Files: default to temporary locations
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create folders
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Create stats JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Read stats back (YAML parser also reads JSON)
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Output buffers: title, index and body of the markdown report
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # Process sections
            for section in stats:
                infos = stats.get(section)
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # Render the entry as a markdown table when it is
                        # table-like (dict or JSON string), else as a bullet.
                        # NOTE(review): broad excepts used for type probing
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info)), orient="index"
                                )
                                is_df = True
                            except:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f"  - [{info}]({info_link})")
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    output.append(f"NA")

            # Write stats in markdown file
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Output stats in markdown
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None
The print_stats function generates a markdown file and prints the statistics contained in a
JSON file in a formatted manner.
Parameters
- output_file: The
output_fileparameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If nooutput_fileis provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within that - json_file: The
json_fileparameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used
Returns
The function
print_statsdoes not return any value. It has a return type annotation ofNone.
915 def get_input(self) -> str: 916 """ 917 It returns the value of the input variable. 918 :return: The input is being returned. 919 """ 920 return self.input
It returns the value of the input variable.
Returns
The input is being returned.
922 def get_input_format(self, input_file: str = None) -> str: 923 """ 924 This function returns the format of the input variable, either from the provided input file or 925 by prompting for input. 926 927 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 928 represents the file path of the input file. If no `input_file` is provided when calling the 929 method, it will default to `None` 930 :type input_file: str 931 :return: The format of the input variable is being returned. 932 """ 933 934 if not input_file: 935 input_file = self.get_input() 936 input_format = get_file_format(input_file) 937 return input_format
This function returns the format of the input variable, either from the provided input file or by prompting for input.
Parameters
- `input_file`: The `input_file` parameter in the `get_input_format` method is a string that represents the file path of the input file. If no `input_file` is provided when calling the method, it will default to `None`.
Returns
The format of the input variable is being returned.
939 def get_input_compressed(self, input_file: str = None) -> str: 940 """ 941 The function `get_input_compressed` returns the format of the input variable after compressing 942 it. 943 944 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 945 that represents the file path of the input file. If no `input_file` is provided when calling the 946 method, it will default to `None` and the method will then call `self.get_input()` to 947 :type input_file: str 948 :return: The function `get_input_compressed` returns the compressed format of the input 949 variable. 950 """ 951 952 if not input_file: 953 input_file = self.get_input() 954 input_compressed = get_file_compressed(input_file) 955 return input_compressed
The function get_input_compressed returns the format of the input variable after compressing
it.
Parameters
- `input_file`: The `input_file` parameter in the `get_input_compressed` method is a string that represents the file path of the input file. If no `input_file` is provided when calling the method, it will default to `None` and the method will then call `self.get_input()` to retrieve it.
Returns
The function
get_input_compressedreturns the compressed format of the input variable.
957 def get_output(self) -> str: 958 """ 959 It returns the output of the neuron. 960 :return: The output of the neural network. 961 """ 962 963 return self.output
It returns the output of the neuron.
Returns
The output of the neural network.
965 def get_output_format(self, output_file: str = None) -> str: 966 """ 967 The function `get_output_format` returns the format of the input variable or the output file if 968 provided. 969 970 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 971 that represents the file path of the output file. If no `output_file` is provided when calling 972 the method, it will default to the output obtained from the `get_output` method of the class 973 instance. The 974 :type output_file: str 975 :return: The format of the input variable is being returned. 976 """ 977 978 if not output_file: 979 output_file = self.get_output() 980 output_format = get_file_format(output_file) 981 982 return output_format
The function get_output_format returns the format of the input variable or the output file if
provided.
Parameters
- `output_file`: The `output_file` parameter in the `get_output_format` method is a string that represents the file path of the output file. If no `output_file` is provided when calling the method, it will default to the output obtained from the `get_output` method of the class instance.
Returns
The format of the input variable is being returned.
984 def get_config(self) -> dict: 985 """ 986 It returns the config 987 :return: The config variable is being returned. 988 """ 989 return self.config
It returns the config
Returns
The config variable is being returned.
991 def get_param(self) -> dict: 992 """ 993 It returns the param 994 :return: The param variable is being returned. 995 """ 996 return self.param
It returns the param
Returns
The param variable is being returned.
998 def get_connexion_db(self) -> str: 999 """ 1000 It returns the connexion_db attribute of the object 1001 :return: The connexion_db is being returned. 1002 """ 1003 return self.connexion_db
It returns the connexion_db attribute of the object
Returns
The connexion_db is being returned.
1005 def get_prefix(self) -> str: 1006 """ 1007 It returns the prefix of the object. 1008 :return: The prefix is being returned. 1009 """ 1010 return self.prefix
It returns the prefix of the object.
Returns
The prefix is being returned.
1012 def get_table_variants(self, clause: str = "select") -> str: 1013 """ 1014 This function returns the table_variants attribute of the object 1015 1016 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 1017 defaults to select (optional) 1018 :return: The table_variants attribute of the object. 1019 """ 1020 1021 # Access 1022 access = self.get_config().get("access", None) 1023 1024 # Clauses "select", "where", "update" 1025 if clause in ["select", "where", "update"]: 1026 table_variants = self.table_variants 1027 # Clause "from" 1028 elif clause in ["from"]: 1029 # For Read Only 1030 if self.get_input_format() in ["parquet"] and access in ["RO"]: 1031 input_file = self.get_input() 1032 table_variants = f"'{input_file}' as variants" 1033 # For Read Write 1034 else: 1035 table_variants = f"{self.table_variants} as variants" 1036 else: 1037 table_variants = self.table_variants 1038 return table_variants
This function returns the table_variants attribute of the object
Parameters
- clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns
The table_variants attribute of the object.
1040 def get_tmp_dir(self) -> str: 1041 """ 1042 The function `get_tmp_dir` returns the temporary directory path based on configuration 1043 parameters or a default path. 1044 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1045 configuration, parameters, and a default value of "/tmp". 1046 """ 1047 1048 return get_tmp( 1049 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1050 )
The function get_tmp_dir returns the temporary directory path based on configuration
parameters or a default path.
Returns
The `get_tmp_dir` method returns the temporary directory path based on the configuration, parameters, and a default value of "/tmp".
1052 def get_connexion_type(self) -> str: 1053 """ 1054 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1055 1056 :return: The connexion type is being returned. 1057 """ 1058 return self.get_config().get("connexion_type", "memory")
If the connexion type is not in the list of allowed connexion types, raise a ValueError
Returns
The connexion type is being returned.
1060 def get_connexion(self): 1061 """ 1062 It returns the connection object 1063 1064 :return: The connection object. 1065 """ 1066 return self.conn
It returns the connection object
Returns
The connection object.
1068 def close_connexion(self) -> None: 1069 """ 1070 This function closes the connection to the database. 1071 :return: The connection is being closed. 1072 """ 1073 return self.conn.close()
This function closes the connection to the database.
Returns
The connection is being closed.
1075 def get_header(self, type: str = "vcf"): 1076 """ 1077 This function returns the header of the VCF file as a list of strings 1078 1079 :param type: the type of header you want to get, defaults to vcf (optional) 1080 :return: The header of the vcf file. 1081 """ 1082 1083 if self.header_vcf: 1084 if type == "vcf": 1085 return self.header_vcf 1086 elif type == "list": 1087 return self.header_list 1088 else: 1089 if type == "vcf": 1090 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1091 return header 1092 elif type == "list": 1093 return vcf_required
This function returns the header of the VCF file as a list of strings
Parameters
- type: the type of header you want to get, defaults to vcf (optional)
Returns
The header of the vcf file.
1095 def get_header_length(self, file: str = None) -> int: 1096 """ 1097 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1098 line. 1099 1100 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1101 header file. If this argument is provided, the function will read the header from the specified 1102 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1103 :type file: str 1104 :return: the length of the header list, excluding the #CHROM line. 1105 """ 1106 1107 if file: 1108 return len(self.read_vcf_header_file(file=file)) - 1 1109 elif self.get_header(type="list"): 1110 return len(self.get_header(type="list")) - 1 1111 else: 1112 return 0
The function get_header_length returns the length of the header list, excluding the #CHROM
line.
Parameters
- `file`: The `file` parameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line).
Returns
the length of the header list, excluding the #CHROM line.
1114 def get_header_columns(self) -> str: 1115 """ 1116 This function returns the header list of a VCF 1117 1118 :return: The length of the header list. 1119 """ 1120 if self.get_header(): 1121 return self.get_header(type="list")[-1] 1122 else: 1123 return ""
This function returns the header list of a VCF
Returns
The length of the header list.
1125 def get_header_columns_as_list(self) -> list: 1126 """ 1127 This function returns the header list of a VCF 1128 1129 :return: The length of the header list. 1130 """ 1131 if self.get_header(): 1132 return self.get_header_columns().strip().split("\t") 1133 else: 1134 return []
This function returns the header list of a VCF
Returns
The length of the header list.
1136 def get_header_columns_as_sql(self) -> str: 1137 """ 1138 This function retruns header length (without #CHROM line) 1139 1140 :return: The length of the header list. 1141 """ 1142 sql_column_list = [] 1143 for col in self.get_header_columns_as_list(): 1144 sql_column_list.append(f'"{col}"') 1145 return ",".join(sql_column_list)
This function retruns header length (without #CHROM line)
Returns
The length of the header list.
1147 def get_header_sample_list( 1148 self, check: bool = False, samples: list = None, samples_force: bool = False 1149 ) -> list: 1150 """ 1151 The function `get_header_sample_list` returns a list of samples from a VCF header, with optional 1152 checking and filtering based on input parameters. 1153 1154 :param check: The `check` parameter in the `get_header_sample_list` function is a boolean 1155 parameter that determines whether to check if the samples in the list are properly defined as 1156 genotype columns. If `check` is set to `True`, the function will verify if each sample in the 1157 list is defined as a, defaults to False 1158 :type check: bool (optional) 1159 :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that 1160 allows you to specify a subset of samples from the header. If you provide a list of sample 1161 names, the function will check if each sample is defined in the header. If a sample is not found 1162 in the 1163 :type samples: list 1164 :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is 1165 a boolean parameter that determines whether to force the function to return the sample list 1166 without checking if the samples are genotype columns. If `samples_force` is set to `True`, the 1167 function will return the sample list without performing, defaults to False 1168 :type samples_force: bool (optional) 1169 :return: The function `get_header_sample_list` returns a list of samples based on the input 1170 parameters and conditions specified in the function. 
1171 """ 1172 1173 # Init 1174 samples_list = [] 1175 1176 if samples is None: 1177 samples_list = self.header_vcf.samples 1178 else: 1179 samples_checked = [] 1180 for sample in samples: 1181 if sample in self.header_vcf.samples: 1182 samples_checked.append(sample) 1183 else: 1184 log.warning(f"Sample '{sample}' not defined in header") 1185 samples_list = samples_checked 1186 1187 # Force sample list without checking if is_genotype_column 1188 if samples_force: 1189 log.warning(f"Samples {samples_list} not checked if genotypes") 1190 return samples_list 1191 1192 if check: 1193 samples_checked = [] 1194 for sample in samples_list: 1195 if self.is_genotype_column(column=sample): 1196 samples_checked.append(sample) 1197 else: 1198 log.warning( 1199 f"Sample '{sample}' not defined as a sample (genotype not well defined)" 1200 ) 1201 samples_list = samples_checked 1202 1203 # Return samples list 1204 return samples_list
The function get_header_sample_list returns a list of samples from a VCF header, with optional
checking and filtering based on input parameters.
Parameters
- `check`: The `check` parameter in the `get_header_sample_list` function is a boolean that determines whether to check if the samples in the list are properly defined as genotype columns. If `check` is set to `True`, the function will verify each sample in the list. Defaults to False.
- `samples`: The `samples` parameter is a list that allows you to specify a subset of samples from the header. If a list of sample names is provided, the function will check that each sample is defined in the header; samples not found in the header are dropped with a warning.
- `samples_force`: The `samples_force` parameter is a boolean that determines whether to return the sample list without checking if the samples are genotype columns. If `samples_force` is set to `True`, the function will return the sample list without performing the check. Defaults to False.
Returns
The function
get_header_sample_listreturns a list of samples based on the input parameters and conditions specified in the function.
1206 def is_genotype_column(self, column: str = None) -> bool: 1207 """ 1208 This function checks if a given column is a genotype column in a database. 1209 1210 :param column: The `column` parameter in the `is_genotype_column` method is a string that 1211 represents the column name in a database table. This method checks if the specified column is a 1212 genotype column in the database. If a column name is provided, it calls the `is_genotype_column` 1213 method of 1214 :type column: str 1215 :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter 1216 is not None, it calls the `is_genotype_column` method of the `Database` class with the specified 1217 column name and returns the result. If the `column` parameter is None, it returns False. 1218 """ 1219 1220 if column is not None: 1221 return Database(database=self.get_input()).is_genotype_column(column=column) 1222 else: 1223 return False
This function checks if a given column is a genotype column in a database.
Parameters
- `column`: The `column` parameter in the `is_genotype_column` method is a string that represents a column name in a database table. If a column name is provided, the method delegates to the `is_genotype_column` method of the `Database` class.
Returns
The `is_genotype_column` method returns a boolean value. If the `column` parameter is not None, it calls the `is_genotype_column` method of the `Database` class with the specified column name and returns the result. If the `column` parameter is None, it returns False.
1225 def get_verbose(self) -> bool: 1226 """ 1227 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1228 exist 1229 1230 :return: The value of the key "verbose" in the config dictionary. 1231 """ 1232 return self.get_config().get("verbose", False)
It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist
Returns
The value of the key "verbose" in the config dictionary.
1234 def get_connexion_format(self) -> str: 1235 """ 1236 It returns the connexion format of the object. 1237 :return: The connexion_format is being returned. 1238 """ 1239 connexion_format = self.connexion_format 1240 if connexion_format not in ["duckdb", "sqlite"]: 1241 log.error(f"Unknown connexion format {connexion_format}") 1242 raise ValueError(f"Unknown connexion format {connexion_format}") 1243 else: 1244 return connexion_format
It returns the connexion format of the object.
Returns
The connexion_format is being returned.
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        "variants" table of the connected database.

        :param file: The file to load into the table (path or file object,
            passed straight to pandas.read_csv)
        :param columns: Comma-separated, quoted column names matching the
            target table
        :type columns: str
        :param header_len: Number of leading lines to skip before the data,
            defaults to 0
        :type header_len: int (optional)
        :param sep: Field separator used in the file, defaults to "\t"
        :type sep: str (optional)
        :param chunksize: Number of rows read per chunk; overridden by the
            "load" -> "chunk" config entry when present, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config may override the chunk size ("load" -> "chunk")
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): if chunksize resolves to a falsy value (0/None), the
        # file is silently not loaded — confirm this is intentional
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # "chunk" is not a SQL table: presumably duckdb resolves it
                    # via its pandas replacement scan on this frame's local
                    # variables — confirm against duckdb Python API docs
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # sqlite path: append the DataFrame chunk directly
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
The function reads a file in chunks and inserts each chunk into a table based on the specified database format.
Parameters
- `file`: the file to load into a table; the path to the file on your system.
- `columns`: a string containing the names of the columns in the table where the data will be inserted. The column names should be separated by commas, for example `"id","name"`.
- `header_len`: the number of lines to skip at the beginning of the file before reading the actual data, allowing any header information to be skipped. Defaults to 0.
- `sep`: the separator character used in the file being read; the default is a tab character (`\t`).
- `chunksize`: the number of rows to read at a time when processing the file in chunks. Defaults to 1000000.
    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        Read the input file and load it into the variants table, with options
        to drop the table first and to bound the schema-inference sample size.

        Supported combinations: a duckdb connexion loads any format handled by
        the `Database` helper (or attaches an existing duckdb database); a
        sqlite connexion loads vcf/tsv/csv/psv files via chunked inserts.

        :param input_file: Optional input file path; when given it replaces the
            current input and the header is reloaded
        :type input_file: str
        :param drop_variants_table: When True, drop the variants table before
            loading, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: Number of rows sampled (passed to
            Database.get_sql_from); falsy values become -1, defaults to 20480
        :type sample_size: int (optional)
        :raises ValueError: When the input format cannot be loaded with the
            current connexion format.
        """

        log.info("Loading...")

        # Change input file and reload its header
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # Drop variants table if requested
        if drop_variants_table:
            self.drop_variants_table()

        # Get table variants name
        table_variants = self.get_table_variants()

        # Access mode (e.g. "RO" for read-only)
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compression state
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # input_compressed_format (only used for logging here)
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format ("duckdb" or "sqlite")
        connexion_format = self.get_connexion_format()

        # Sample size: falsy means no limit (-1)
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # Database already exists as a duckdb file
            if self.input_format in ["db", "duckdb"]:

                # NOTE(review): the else branch below is unreachable — the
                # enclosing if already guarantees connexion_format is "duckdb"
                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    log.error(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )

            # Load from another format through the Database helper
            else:

                try:
                    # Create Table (read-write) or View (read-only)
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                # NOTE(review): bare except — any failure (not just format
                # errors) is reported as "format not available"
                except:
                    # Format not available
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion: only delimited text formats are supported
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Main VCF column structure (column name -> SQL type)
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Structure extended with FORMAT and one column per sample
            # NOTE(review): structure_complete aliases structure (same dict),
            # so both names mutate the same object
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Column lists for CREATE TABLE and for the INSERT column clause
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create the variants table
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # chunksize defines the number of rows per chunk while loading
            chunksize = 100000

            # Delimiter for the input format (tab by default)
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # Compressed input: re-open through bgzf instead
                # NOTE(review): the plain handle from open() above is then
                # unused (though still closed by the with-block)
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                # VCF: skip the meta-header lines; other formats have none
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into the table, chunk by chunk
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFO fields into dedicated table columns if configured
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create indexes after insertion
        self.create_indexes()
The load_data function reads a VCF file and inserts it into a table, with options to drop the
table before loading the data and specify a sample size.
Parameters
- input_file: The path to the input file. This is the VCF file that will be loaded into the table
- `drop_variants_table`: a boolean flag that determines whether the variants table should be dropped before loading the data. If set to `True`, the variants table will be dropped; if set to `False` (the default), it will not be.
- `sample_size`: the number of rows to be sampled from the input file. If it is set to `None`, the value -1 (no limit) will be used instead. Defaults to 20480.
1496 def get_explode_infos(self) -> bool: 1497 """ 1498 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1499 to False if it is not set. 1500 :return: The method is returning the value of the "explode_infos" parameter, which is a boolean 1501 value. If the parameter is not present, it will return False. 1502 """ 1503 1504 return self.get_param().get("explode", {}).get("explode_infos", False)
The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting
to False if it is not set.
Returns
The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.
    def get_explode_infos_fields(
        self,
        explode_infos_fields: str = None,
        remove_fields_not_in_header: bool = False,
    ) -> list:
        """
        Resolve the list of INFO fields to explode into table columns.

        Fields may be given as a comma-separated string or a list; each entry
        is treated as a regex pattern matched against the header INFO fields,
        and the "*" keyword expands to all header fields.

        :param explode_infos_fields: Fields to explode ("*", a comma-separated
            string, or a list); when not given, read from the "explode"
            parameters and defaulting to "*"
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: When True, drop any resolved field
            that is not present in the header, defaults to False
        :type remove_fields_not_in_header: bool (optional)
        :return: The ordered, de-duplicated list of field names to explode.
        """

        # If no fields, get it in param
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If no fields, defined as all fields in header using keyword
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Input fields list (string is split on commas; list kept as-is)
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without * keyword
            # NOTE(review): fields_without_all is computed but never used
            # below — confirm whether it can be removed
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # Fields in header (sorted, unique)
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # Format keyword * as the match-all regex
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all header fields matching the pattern
                r = re.compile(field)
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # An exact header match wins over pattern expansion; otherwise
                # drop explicitly-listed fields from the pattern's expansion
                if field in fields_search:
                    fields_search = [field]
                elif fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # If field is not in header, keep it anyway unless asked to
                # remove it (tolerates not-well-formatted headers)
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field, if not already exists, and if it is in header (if asked)
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []
The get_explode_infos_fields function returns a list of exploded information fields based on
the input parameter explode_infos_fields.
Parameters
- explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a comma-separated list of field names to explode.
- remove_fields_not_in_header: The `remove_fields_not_in_header` parameter is a boolean flag that determines whether to remove fields that are not present in the header. If it is set to `True`, any field that is not in the header will be excluded from the list of exploded information fields. Defaults to False.
Returns
The function `get_explode_infos_fields` returns a list of exploded information fields. If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty list. Otherwise, it returns a list of exploded information fields after stripping spaces and splitting the string by commas.
1606 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1607 """ 1608 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1609 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1610 not provided. 1611 1612 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1613 prefix to be used for exploding or expanding information 1614 :type explode_infos_prefix: str 1615 :return: the value of the variable `explode_infos_prefix`. 1616 """ 1617 1618 if not explode_infos_prefix: 1619 explode_infos_prefix = ( 1620 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1621 ) 1622 1623 return explode_infos_prefix
The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or
the value of self.get_param().get("explode_infos_prefix", None) if explode_infos_prefix is
not provided.
Parameters
- explode_infos_prefix: The `explode_infos_prefix` parameter is a string that specifies a prefix to be used for exploding or expanding information.
Returns
the value of the variable `explode_infos_prefix`.
    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        Add a column to a SQLite or DuckDB table, with an optional default value.

        If the column already exists (case-insensitive match), it is either left
        untouched (returning None) or, when `drop` is True, dropped and re-created.

        :param table_name: name of the table to which the column is added
        :param column_name: name of the column to add
        :param column_type: SQL data type of the new column, e.g. "INTEGER",
        "TEXT", "REAL", "VARCHAR"
        :param default_value: optional default value assigned to the new column;
        it is injected verbatim into the ALTER TABLE statement
        :param drop: whether to drop a pre-existing column of the same name
        before re-adding it, defaults to False
        :type drop: bool (optional)
        :return: a dict describing the added column (keys: "table_name",
        "column_name", "column_type", "default_value"), or None when the column
        already existed (and was not added anew, or was dropped and re-created)
        """

        # Flags: 'added' is reported only when the column is genuinely new;
        # a drop-and-recreate keeps 'dropped' True and therefore returns None
        added = False
        dropped = False

        # Check if the column already exists in the table
        # (LIMIT 0 fetches only the schema, no rows)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name.upper() in [c.upper() for c in columns]:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                # Column present and no drop requested: nothing to do
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        # NOTE(review): table_name/column_type are interpolated unescaped —
        # callers are expected to pass trusted identifiers
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        # Describe the column only when it is newly added (not re-created)
        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column
The add_column function adds a column to a SQLite or DuckDB table with a default value if it
doesn't already exist.
Parameters
- table_name: The name of the table to which you want to add a column
- column_name: The parameter "column_name" is the name of the column that you want to add to the table
- column_type: The `column_type` parameter specifies the data type of the column that you want to add to the table. It should be a string that represents the desired data type, such as "INTEGER", "TEXT", "REAL", etc.
- default_value: The `default_value` parameter is an optional parameter that specifies the default value for the newly added column. If a default value is provided, it will be assigned to the column for any existing rows that do not have a value for that column.
- drop: The `drop` parameter is a boolean flag that determines whether to drop the column if it already exists in the table. If `drop` is set to `True`, the function will drop the existing column before adding the new column. Defaults to False.
Returns
a dictionary describing the added column (table name, column name, column type, default value) if the column was added, or None otherwise.
1697 def drop_column( 1698 self, column: dict = None, table_name: str = None, column_name: str = None 1699 ) -> bool: 1700 """ 1701 The `drop_column` function drops a specified column from a given table in a database and returns 1702 True if the column was successfully dropped, and False if the column does not exist in the 1703 table. 1704 1705 :param column: The `column` parameter is a dictionary that contains information about the column 1706 you want to drop. It has two keys: 1707 :type column: dict 1708 :param table_name: The `table_name` parameter is the name of the table from which you want to 1709 drop a column 1710 :type table_name: str 1711 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1712 from the table 1713 :type column_name: str 1714 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1715 and False if the column does not exist in the table. 1716 """ 1717 1718 # Find column infos 1719 if column: 1720 if isinstance(column, dict): 1721 table_name = column.get("table_name", None) 1722 column_name = column.get("column_name", None) 1723 elif isinstance(column, str): 1724 table_name = self.get_table_variants() 1725 column_name = column 1726 else: 1727 table_name = None 1728 column_name = None 1729 1730 if not table_name and not column_name: 1731 return False 1732 1733 # Removed 1734 removed = False 1735 1736 # Check if the column already exists in the table 1737 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1738 columns = self.get_query_to_df(query).columns.tolist() 1739 if column_name in columns: 1740 log.debug(f"The {column_name} column exists in the {table_name} table") 1741 else: 1742 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1743 return False 1744 1745 # Add column in table # ALTER TABLE integers DROP k 1746 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1747 
self.execute_query(add_column_query) 1748 removed = True 1749 log.debug( 1750 f"The {column_name} column was successfully dropped to the {table_name} table" 1751 ) 1752 1753 return removed
The drop_column function drops a specified column from a given table in a database and returns
True if the column was successfully dropped, and False if the column does not exist in the
table.
Parameters
- column: The `column` parameter is either a dictionary that contains information about the column you want to drop (with keys "table_name" and "column_name"), or a string column name.
- table_name: The `table_name` parameter is the name of the table from which you want to drop a column.
- column_name: The `column_name` parameter is the name of the column that you want to drop from the table.
Returns
a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.
1755 def explode_infos( 1756 self, 1757 prefix: str = None, 1758 create_index: bool = False, 1759 fields: list = None, 1760 force: bool = False, 1761 proccess_all_fields_together: bool = False, 1762 table: str = None, 1763 ) -> list: 1764 """ 1765 The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into 1766 individual columns, returning a list of added columns. 1767 1768 :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO 1769 fields. If the `prefix` is not provided or is set to `None`, the function will use the value of 1770 `self.get_explode_infos_prefix()` as the prefix 1771 :type prefix: str 1772 :param create_index: The `create_index` parameter is a boolean flag that specifies whether to 1773 create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to 1774 `False`, indexes will not be created. The default value is `False`, defaults to False 1775 :type create_index: bool (optional) 1776 :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields 1777 that you want to explode into individual columns. If this parameter is not provided, all INFO 1778 fields will be exploded. You can specify the INFO fields you want to explode by passing them as 1779 a list to the ` 1780 :type fields: list 1781 :param force: The `force` parameter in the `explode_infos` function is a boolean flag that 1782 determines whether to drop and recreate a column if it already exists in the table. If `force` 1783 is set to `True`, the column will be dropped and recreated. If `force` is set to `False, 1784 defaults to False 1785 :type force: bool (optional) 1786 :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean 1787 flag that determines whether to process all the INFO fields together or individually. If set to 1788 `True`, all the INFO fields will be processed together. 
If set to `False`, each INFO field will 1789 be processed individually. The default value is, defaults to False 1790 :type proccess_all_fields_together: bool (optional) 1791 :param table: The `table` parameter in the `explode_infos` function is used to specify the name 1792 of the table where the exploded INFO fields will be added as individual columns. If you provide 1793 a value for the `table` parameter, the function will use that table name. If the `table` 1794 parameter is 1795 :type table: str 1796 :return: The `explode_infos` function returns a list of added columns. 1797 """ 1798 1799 # drop indexes 1800 self.drop_indexes() 1801 1802 # connexion format 1803 connexion_format = self.get_connexion_format() 1804 1805 # Access 1806 access = self.get_config().get("access", None) 1807 1808 # Added columns 1809 added_columns = [] 1810 1811 if access not in ["RO"]: 1812 1813 # prefix 1814 if prefix in [None, True] or not isinstance(prefix, str): 1815 if self.get_explode_infos_prefix() not in [None, True]: 1816 prefix = self.get_explode_infos_prefix() 1817 else: 1818 prefix = "INFO/" 1819 1820 # table variants 1821 if table is not None: 1822 table_variants = table 1823 else: 1824 table_variants = self.get_table_variants(clause="select") 1825 1826 # extra infos 1827 try: 1828 extra_infos = self.get_extra_infos() 1829 except: 1830 extra_infos = [] 1831 1832 # Header infos 1833 header_infos = self.get_header().infos 1834 1835 log.debug( 1836 f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields" 1837 ) 1838 1839 sql_info_alter_table_array = [] 1840 1841 # Info fields to check 1842 fields_list = list(header_infos) 1843 if fields: 1844 fields_list += fields 1845 fields_list = set(fields_list) 1846 1847 # If no fields 1848 if not fields: 1849 fields = [] 1850 1851 # Translate fields if patterns 1852 fields = self.get_explode_infos_fields(explode_infos_fields=fields) 1853 1854 for info in fields: 1855 1856 info_id_sql = prefix + info 1857 1858 if ( 1859 info 
in fields_list 1860 or prefix + info in fields_list 1861 or info in extra_infos 1862 ): 1863 1864 log.debug(f"Explode INFO fields - ADD '{info}' annotations fields") 1865 1866 if info in header_infos: 1867 info_type = header_infos[info].type 1868 info_num = header_infos[info].num 1869 else: 1870 info_type = "String" 1871 info_num = 0 1872 1873 type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR") 1874 if info_num != 1: 1875 type_sql = "VARCHAR" 1876 1877 # Add field 1878 added_column = self.add_column( 1879 table_name=table_variants, 1880 column_name=info_id_sql, 1881 column_type=type_sql, 1882 default_value="null", 1883 drop=force, 1884 ) 1885 1886 if added_column: 1887 added_columns.append(added_column) 1888 1889 if added_column or force: 1890 1891 # add field to index 1892 self.index_additionnal_fields.append(info_id_sql) 1893 1894 # Update field array 1895 if connexion_format in ["duckdb"]: 1896 update_info_field = f""" 1897 "{info_id_sql}" = 1898 CASE 1899 WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL 1900 ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) 1901 END 1902 """ 1903 elif connexion_format in ["sqlite"]: 1904 update_info_field = f""" 1905 "{info_id_sql}" = 1906 CASE 1907 WHEN instr(INFO, '{info}=') = 0 THEN NULL 1908 WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1) 1909 ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1) 1910 END 1911 """ 1912 1913 sql_info_alter_table_array.append(update_info_field) 1914 1915 if sql_info_alter_table_array: 1916 1917 # By chromosomes 1918 try: 1919 chromosomes_list = list( 1920 self.get_query_to_df( 1921 f""" 
SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """ 1922 )["#CHROM"] 1923 ) 1924 except: 1925 chromosomes_list = [None] 1926 1927 for chrom in chromosomes_list: 1928 log.debug(f"Explode INFO fields - Chromosome {chrom}...") 1929 1930 # Where clause 1931 where_clause = "" 1932 if chrom and len(chromosomes_list) > 1: 1933 where_clause = f""" WHERE "#CHROM" = '{chrom}' """ 1934 1935 # Update table 1936 if proccess_all_fields_together: 1937 sql_info_alter_table_array_join = ", ".join( 1938 sql_info_alter_table_array 1939 ) 1940 if sql_info_alter_table_array_join: 1941 sql_info_alter_table = f""" 1942 UPDATE {table_variants} 1943 SET {sql_info_alter_table_array_join} 1944 {where_clause} 1945 """ 1946 log.debug( 1947 f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..." 1948 ) 1949 # log.debug(sql_info_alter_table) 1950 self.conn.execute(sql_info_alter_table) 1951 else: 1952 sql_info_alter_num = 0 1953 for sql_info_alter in sql_info_alter_table_array: 1954 sql_info_alter_num += 1 1955 sql_info_alter_table = f""" 1956 UPDATE {table_variants} 1957 SET {sql_info_alter} 1958 {where_clause} 1959 """ 1960 log.debug( 1961 f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..." 1962 ) 1963 # log.debug(sql_info_alter_table) 1964 self.conn.execute(sql_info_alter_table) 1965 1966 # create indexes 1967 if create_index: 1968 self.create_indexes() 1969 1970 return added_columns
The explode_infos function in Python takes a VCF file and explodes the INFO fields into
individual columns, returning a list of added columns.
Parameters
- prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO fields. If the `prefix` is not provided or is set to `None`, the function will use the value of `self.get_explode_infos_prefix()` as the prefix.
- create_index: The `create_index` parameter is a boolean flag that specifies whether to create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to `False`, indexes will not be created. Defaults to False.
- fields: The `fields` parameter is a list of INFO fields that you want to explode into individual columns. If this parameter is not provided, all INFO fields will be exploded.
- force: The `force` parameter is a boolean flag that determines whether to drop and recreate a column if it already exists in the table. If `force` is set to `True`, the column will be dropped and recreated. Defaults to False.
- proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean flag that determines whether to process all the INFO fields together or individually. If set to `True`, all the INFO fields will be processed together; if set to `False`, each INFO field will be processed individually. Defaults to False.
- table: The `table` parameter is used to specify the name of the table where the exploded INFO fields will be added as individual columns. If not provided, the variants table is used.
Returns
The `explode_infos` function returns a list of added columns.
1972 def create_indexes(self) -> None: 1973 """ 1974 Create indexes on the table after insertion 1975 """ 1976 1977 # Access 1978 access = self.get_config().get("access", None) 1979 1980 # get table variants 1981 table_variants = self.get_table_variants("FROM") 1982 1983 if self.get_indexing() and access not in ["RO"]: 1984 # Create index 1985 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")' 1986 self.conn.execute(sql_create_table_index) 1987 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")' 1988 self.conn.execute(sql_create_table_index) 1989 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")' 1990 self.conn.execute(sql_create_table_index) 1991 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 1992 self.conn.execute(sql_create_table_index) 1993 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 1994 self.conn.execute(sql_create_table_index) 1995 for field in self.index_additionnal_fields: 1996 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 1997 self.conn.execute(sql_create_table_index)
Create indexes on the table after insertion
1999 def drop_indexes(self) -> None: 2000 """ 2001 Create indexes on the table after insertion 2002 """ 2003 2004 # Access 2005 access = self.get_config().get("access", None) 2006 2007 # get table variants 2008 table_variants = self.get_table_variants("FROM") 2009 2010 # Get database format 2011 connexion_format = self.get_connexion_format() 2012 2013 if access not in ["RO"]: 2014 if connexion_format in ["duckdb"]: 2015 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 2016 elif connexion_format in ["sqlite"]: 2017 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 2018 2019 list_indexes = self.conn.execute(sql_list_indexes) 2020 index_names = [row[0] for row in list_indexes.fetchall()] 2021 for index in index_names: 2022 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 2023 self.conn.execute(sql_drop_table_index)
Drop all indexes of the variants table.
2025 def read_vcf_header(self, f) -> list: 2026 """ 2027 It reads the header of a VCF file and returns a list of the header lines 2028 2029 :param f: the file object 2030 :return: The header lines of the VCF file. 2031 """ 2032 2033 header_list = [] 2034 for line in f: 2035 header_list.append(line) 2036 if line.startswith("#CHROM"): 2037 break 2038 return header_list
It reads the header of a VCF file and returns a list of the header lines
Parameters
- f: the file object
Returns
The header lines of the VCF file.
2040 def read_vcf_header_file(self, file: str = None) -> list: 2041 """ 2042 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 2043 uncompressed files. 2044 2045 :param file: The `file` parameter is a string that represents the path to the VCF header file 2046 that you want to read. It is an optional parameter, so if you don't provide a value, it will 2047 default to `None` 2048 :type file: str 2049 :return: The function `read_vcf_header_file` returns a list. 2050 """ 2051 2052 if self.get_input_compressed(input_file=file): 2053 with bgzf.open(file, "rt") as f: 2054 return self.read_vcf_header(f=f) 2055 else: 2056 with open(file, "rt") as f: 2057 return self.read_vcf_header(f=f)
The read_vcf_header_file function reads the header of a VCF file, handling both compressed and
uncompressed files.
Parameters
- file: The `file` parameter is a string that represents the path to the VCF header file that you want to read. It is an optional parameter, so if you don't provide a value, it will default to `None`.
Returns
The function `read_vcf_header_file` returns a list.
2059 def execute_query(self, query: str): 2060 """ 2061 It takes a query as an argument, executes it, and returns the results 2062 2063 :param query: The query to be executed 2064 :return: The result of the query is being returned. 2065 """ 2066 if query: 2067 return self.conn.execute(query) # .fetchall() 2068 else: 2069 return None
It takes a query as an argument, executes it, and returns the results
Parameters
- query: The query to be executed
Returns
The result of the query is being returned.
2071 def export_output( 2072 self, 2073 output_file: str | None = None, 2074 output_header: str | None = None, 2075 export_header: bool = True, 2076 query: str | None = None, 2077 parquet_partitions: list | None = None, 2078 chunk_size: int | None = None, 2079 threads: int | None = None, 2080 sort: bool = False, 2081 index: bool = False, 2082 order_by: str | None = None, 2083 ) -> bool: 2084 """ 2085 The `export_output` function exports data from a VCF file to a specified output file in various 2086 formats, including VCF, CSV, TSV, PSV, and Parquet. 2087 2088 :param output_file: The `output_file` parameter is a string that specifies the name of the 2089 output file to be generated by the function. This is where the exported data will be saved 2090 :type output_file: str 2091 :param output_header: The `output_header` parameter is a string that specifies the name of the 2092 file where the header of the VCF file will be exported. If this parameter is not provided, the 2093 header will be exported to a file with the same name as the `output_file` parameter, but with 2094 the extension " 2095 :type output_header: str 2096 :param export_header: The `export_header` parameter is a boolean flag that determines whether 2097 the header of a VCF file should be exported to a separate file or not. If `export_header` is 2098 True, the header will be exported to a file. If `export_header` is False, the header will not 2099 be, defaults to True, if output format is not VCF 2100 :type export_header: bool (optional) 2101 :param query: The `query` parameter is an optional SQL query that can be used to filter and 2102 select specific data from the VCF file before exporting it. If provided, only the data that 2103 matches the query will be exported 2104 :type query: str 2105 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 2106 columns to be used for partitioning the Parquet file during export. 
Partitioning is a way to 2107 organize data in a hierarchical directory structure based on the values of one or more columns. 2108 This can improve query performance when working with large datasets 2109 :type parquet_partitions: list 2110 :param chunk_size: The `chunk_size` parameter specifies the number of 2111 records in batch when exporting data in Parquet format. This parameter is used for 2112 partitioning the Parquet file into multiple files. 2113 :type chunk_size: int 2114 :param threads: The `threads` parameter is an optional parameter that specifies the number of 2115 threads to be used during the export process. It determines the level of parallelism and can 2116 improve the performance of the export operation. If not provided, the function will use the 2117 default number of threads 2118 :type threads: int 2119 :param sort: The `sort` parameter is a boolean flag that determines whether the output file 2120 should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the 2121 genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to 2122 False 2123 :type sort: bool (optional) 2124 :param index: The `index` parameter is a boolean flag that determines whether an index should be 2125 created on the output file. If `index` is True, an index will be created. If `index` is False, 2126 no index will be created. The default value is False, defaults to False 2127 :type index: bool (optional) 2128 :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for 2129 sorting the output file. This parameter is only applicable when exporting data in VCF format 2130 :type order_by: str 2131 :return: a boolean value. It checks if the output file exists and returns True if it does, or 2132 None if it doesn't. 
2133 """ 2134 2135 # Log 2136 log.info("Exporting...") 2137 2138 # Full path 2139 output_file = full_path(output_file) 2140 output_header = full_path(output_header) 2141 2142 # Config 2143 config = self.get_config() 2144 2145 # Param 2146 param = self.get_param() 2147 2148 # Tmp files to remove 2149 tmp_to_remove = [] 2150 2151 # If no output, get it 2152 if not output_file: 2153 output_file = self.get_output() 2154 2155 # If not threads 2156 if not threads: 2157 threads = self.get_threads() 2158 2159 # Auto header name with extension 2160 if export_header or output_header: 2161 if not output_header: 2162 output_header = f"{output_file}.hdr" 2163 # Export header 2164 self.export_header(output_file=output_file) 2165 2166 # Switch off export header if VCF output 2167 output_file_type = get_file_format(output_file) 2168 if output_file_type in ["vcf"]: 2169 export_header = False 2170 tmp_to_remove.append(output_header) 2171 2172 # Chunk size 2173 if not chunk_size: 2174 chunk_size = config.get("chunk_size", None) 2175 2176 # Parquet partition 2177 if not parquet_partitions: 2178 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2179 if parquet_partitions and isinstance(parquet_partitions, str): 2180 parquet_partitions = parquet_partitions.split(",") 2181 2182 # Order by 2183 if not order_by: 2184 order_by = param.get("export", {}).get("order_by", "") 2185 2186 # Header in output 2187 header_in_output = param.get("export", {}).get("include_header", False) 2188 2189 # Database 2190 database_source = self.get_connexion() 2191 2192 # Connexion format 2193 connexion_format = self.get_connexion_format() 2194 2195 # Explode infos 2196 if self.get_explode_infos(): 2197 self.explode_infos( 2198 prefix=self.get_explode_infos_prefix(), 2199 fields=self.get_explode_infos_fields(), 2200 force=False, 2201 ) 2202 2203 # if connexion_format in ["sqlite"] or query: 2204 if connexion_format in ["sqlite"]: 2205 2206 # Export in Parquet 2207 random_tmp = 
"".join( 2208 random.choice(string.ascii_lowercase) for i in range(10) 2209 ) 2210 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2211 tmp_to_remove.append(database_source) 2212 2213 # Table Variants 2214 table_variants = self.get_table_variants() 2215 2216 # Create export query 2217 sql_query_export_subquery = f""" 2218 SELECT * FROM {table_variants} 2219 """ 2220 2221 # Write source file 2222 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2223 2224 # Create database 2225 database = Database( 2226 database=database_source, 2227 table="variants", 2228 header_file=output_header, 2229 conn_config=self.get_connexion_config(), 2230 ) 2231 2232 # Existing colomns header 2233 existing_columns_header = database.get_header_columns_from_database() 2234 2235 # Sample list 2236 get_samples = self.get_samples() 2237 get_samples_check = self.get_samples_check() 2238 samples_force = get_samples is not None 2239 sample_list = self.get_header_sample_list( 2240 check=get_samples_check, samples=get_samples, samples_force=samples_force 2241 ) 2242 2243 # Export file 2244 database.export( 2245 output_database=output_file, 2246 output_header=output_header, 2247 existing_columns_header=existing_columns_header, 2248 parquet_partitions=parquet_partitions, 2249 chunk_size=chunk_size, 2250 threads=threads, 2251 sort=sort, 2252 index=index, 2253 header_in_output=header_in_output, 2254 order_by=order_by, 2255 query=query, 2256 export_header=export_header, 2257 sample_list=sample_list, 2258 ) 2259 2260 # Remove 2261 remove_if_exists(tmp_to_remove) 2262 2263 return (os.path.exists(output_file) or None) and ( 2264 os.path.exists(output_file) or None 2265 )
The export_output function exports data from a VCF file to a specified output file in various
formats, including VCF, CSV, TSV, PSV, and Parquet.
Parameters
- output_file: The `output_file` parameter is a string that specifies the name of the output file to be generated by the function. This is where the exported data will be saved.
- output_header: The `output_header` parameter is a string that specifies the name of the file where the header of the VCF file will be exported. If this parameter is not provided, the header will be exported to a file with the same name as the `output_file` parameter, but with the extension ".hdr".
- export_header: The `export_header` parameter is a boolean flag that determines whether the header of a VCF file should be exported to a separate file or not. If `export_header` is True, the header will be exported to a file. Defaults to True, unless the output format is VCF.
- query: The `query` parameter is an optional SQL query that can be used to filter and select specific data from the VCF file before exporting it. If provided, only the data that matches the query will be exported.
- parquet_partitions: The `parquet_partitions` parameter is a list that specifies the columns to be used for partitioning the Parquet file during export. Partitioning is a way to organize data in a hierarchical directory structure based on the values of one or more columns. This can improve query performance when working with large datasets.
- chunk_size: The `chunk_size` parameter specifies the number of records in a batch when exporting data in Parquet format. This parameter is used for partitioning the Parquet file into multiple files.
- threads: The `threads` parameter is an optional parameter that specifies the number of threads to be used during the export process. It determines the level of parallelism and can improve the performance of the export operation. If not provided, the function will use the default number of threads.
- sort: The `sort` parameter is a boolean flag that determines whether the output file should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the genomic coordinates of the variants. Defaults to False.
- index: The `index` parameter is a boolean flag that determines whether an index should be created on the output file. Defaults to False.
- order_by: The `order_by` parameter is a string that specifies the column(s) to use for sorting the output file. This parameter is only applicable when exporting data in VCF format.
Returns
a boolean value. It checks if the output file exists and returns True if it does, or None if it doesn't.
2267 def get_extra_infos(self, table: str = None) -> list: 2268 """ 2269 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2270 in the header. 2271 2272 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2273 name of the table from which you want to retrieve the extra columns that are not present in the 2274 header. If the `table` parameter is not provided when calling the function, it will default to 2275 using the variants 2276 :type table: str 2277 :return: A list of columns that are in the specified table but not in the header of the table. 2278 """ 2279 2280 header_columns = [] 2281 2282 if not table: 2283 table = self.get_table_variants(clause="from") 2284 header_columns = self.get_header_columns() 2285 2286 # Check all columns in the database 2287 query = f""" SELECT * FROM {table} LIMIT 1 """ 2288 log.debug(f"query {query}") 2289 table_columns = self.get_query_to_df(query).columns.tolist() 2290 extra_columns = [] 2291 2292 # Construct extra infos (not in header) 2293 for column in table_columns: 2294 if column not in header_columns: 2295 extra_columns.append(column) 2296 2297 return extra_columns
The get_extra_infos function returns a list of columns that are in a specified table but not
in the header.
Parameters
- table: The
`table` parameter in the `get_extra_infos` function is used to specify the name of the table from which you want to retrieve the extra columns that are not present in the header. If the `table` parameter is not provided when calling the function, it will default to using the variants table.
Returns
A list of columns that are in the specified table but not in the header of the table.
2299 def get_extra_infos_sql(self, table: str = None) -> str: 2300 """ 2301 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2302 by double quotes 2303 2304 :param table: The name of the table to get the extra infos from. If None, the default table is 2305 used 2306 :type table: str 2307 :return: A string of the extra infos 2308 """ 2309 2310 return ", ".join( 2311 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2312 )
It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes
Parameters
- table: The name of the table to get the extra infos from. If None, the default table is used
Returns
A string of the extra infos
2314 def export_header( 2315 self, 2316 header_name: str = None, 2317 output_file: str = None, 2318 output_file_ext: str = ".hdr", 2319 clean_header: bool = True, 2320 remove_chrom_line: bool = False, 2321 ) -> str: 2322 """ 2323 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2324 specified options, and writes it to a new file. 2325 2326 :param header_name: The `header_name` parameter is the name of the header file to be created. If 2327 this parameter is not specified, the header will be written to the output file 2328 :type header_name: str 2329 :param output_file: The `output_file` parameter in the `export_header` function is used to 2330 specify the name of the output file where the header will be written. If this parameter is not 2331 provided, the header will be written to a temporary file 2332 :type output_file: str 2333 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2334 string that represents the extension of the output header file. By default, it is set to ".hdr" 2335 if not specified by the user. This extension will be appended to the `output_file` name to 2336 create the final, defaults to .hdr 2337 :type output_file_ext: str (optional) 2338 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2339 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2340 `True`, the function will clean the header by modifying certain lines based on a specific 2341 pattern. If `clean_header`, defaults to True 2342 :type clean_header: bool (optional) 2343 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2344 boolean flag that determines whether the #CHROM line should be removed from the header before 2345 writing it to the output file. 
If set to `True`, the #CHROM line will be removed; if set to `, 2346 defaults to False 2347 :type remove_chrom_line: bool (optional) 2348 :return: The function `export_header` returns the name of the temporary header file that is 2349 created. 2350 """ 2351 2352 if not header_name and not output_file: 2353 output_file = self.get_output() 2354 2355 if self.get_header(): 2356 2357 # Get header object 2358 header_obj = self.get_header() 2359 2360 # Create database 2361 db_for_header = Database(database=self.get_input()) 2362 2363 # Get real columns in the file 2364 db_header_columns = db_for_header.get_columns() 2365 2366 with tempfile.TemporaryDirectory() as tmpdir: 2367 2368 # Write header file 2369 header_file_tmp = os.path.join(tmpdir, "header") 2370 f = open(header_file_tmp, "w") 2371 vcf.Writer(f, header_obj) 2372 f.close() 2373 2374 # Replace #CHROM line with rel columns 2375 header_list = db_for_header.read_header_file( 2376 header_file=header_file_tmp 2377 ) 2378 header_list[-1] = "\t".join(db_header_columns) 2379 2380 # Remove CHROM line 2381 if remove_chrom_line: 2382 header_list.pop() 2383 2384 # Clean header 2385 if clean_header: 2386 header_list_clean = [] 2387 for head in header_list: 2388 # Clean head for malformed header 2389 head_clean = head 2390 head_clean = re.subn( 2391 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2392 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2393 head_clean, 2394 2, 2395 )[0] 2396 # Write header 2397 header_list_clean.append(head_clean) 2398 header_list = header_list_clean 2399 2400 tmp_header_name = output_file + output_file_ext 2401 2402 f = open(tmp_header_name, "w") 2403 for line in header_list: 2404 f.write(line) 2405 f.close() 2406 2407 return tmp_header_name
The export_header function takes a VCF file, extracts the header, modifies it according to
specified options, and writes it to a new file.
Parameters
- header_name: The
header_nameparameter is the name of the header file to be created. If this parameter is not specified, the header will be written to the output file - output_file: The
output_fileparameter in theexport_headerfunction is used to specify the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file - output_file_ext: The
output_file_extparameter in theexport_headerfunction is a string that represents the extension of the output header file. By default, it is set to ".hdr" if not specified by the user. This extension will be appended to theoutput_filename to create the final, defaults to .hdr - clean_header: The
clean_headerparameter in theexport_headerfunction is a boolean flag that determines whether the header should be cleaned or not. Whenclean_headeris set toTrue, the function will clean the header by modifying certain lines based on a specific pattern. Ifclean_header, defaults to True - remove_chrom_line: The
remove_chrom_lineparameter in theexport_headerfunction is a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set toTrue, the #CHROM line will be removed; if set to `, defaults to False
Returns
The function
export_headerreturns the name of the temporary header file that is created.
2409 def export_variant_vcf( 2410 self, 2411 vcf_file, 2412 remove_info: bool = False, 2413 add_samples: bool = True, 2414 list_samples: list = [], 2415 where_clause: str = "", 2416 index: bool = False, 2417 threads: int | None = None, 2418 ) -> bool | None: 2419 """ 2420 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2421 remove INFO field, add samples, and control compression and indexing. 2422 2423 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2424 written to. It is the output file that will contain the filtered VCF data based on the specified 2425 parameters 2426 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2427 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2428 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2429 in, defaults to False 2430 :type remove_info: bool (optional) 2431 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2432 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2433 If set to False, the samples will be removed. The default value is True, defaults to True 2434 :type add_samples: bool (optional) 2435 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2436 in the output VCF file. By default, all samples will be included. If you provide a list of 2437 samples, only those samples will be included in the output file 2438 :type list_samples: list 2439 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2440 determines whether or not to create an index for the output VCF file. If `index` is set to 2441 `True`, the output VCF file will be indexed using tabix. 
If `index`, defaults to False 2442 :type index: bool (optional) 2443 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2444 number of threads to use for exporting the VCF file. It determines how many parallel threads 2445 will be used during the export process. More threads can potentially speed up the export process 2446 by utilizing multiple cores of the processor. If 2447 :type threads: int | None 2448 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2449 method with various parameters including the output file, query, threads, sort flag, and index 2450 flag. The `export_output` method is responsible for exporting the VCF data based on the 2451 specified parameters and configurations provided in the `export_variant_vcf` function. 2452 """ 2453 2454 # Config 2455 config = self.get_config() 2456 2457 # Extract VCF 2458 log.debug("Export VCF...") 2459 2460 # Table variants 2461 table_variants = self.get_table_variants() 2462 2463 # Threads 2464 if not threads: 2465 threads = self.get_threads() 2466 2467 # Info fields 2468 if remove_info: 2469 if not isinstance(remove_info, str): 2470 remove_info = "." 
2471 info_field = f"""'{remove_info}' as INFO""" 2472 else: 2473 info_field = "INFO" 2474 2475 # Samples fields 2476 if add_samples: 2477 if not list_samples: 2478 list_samples = self.get_header_sample_list() 2479 if list_samples: 2480 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2481 else: 2482 samples_fields = "" 2483 log.debug(f"samples_fields: {samples_fields}") 2484 else: 2485 samples_fields = "" 2486 2487 # Where clause 2488 if where_clause is None: 2489 where_clause = "" 2490 2491 # Variants 2492 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2493 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2494 log.debug(f"sql_query_select={sql_query_select}") 2495 2496 return self.export_output( 2497 output_file=vcf_file, 2498 output_header=None, 2499 export_header=True, 2500 query=sql_query_select, 2501 parquet_partitions=None, 2502 chunk_size=config.get("chunk_size", None), 2503 threads=threads, 2504 sort=True, 2505 index=index, 2506 order_by=None, 2507 )
The export_variant_vcf function exports a VCF file with specified samples, allowing options to
remove INFO field, add samples, and control compression and indexing.
Parameters
- vcf_file: The
vcf_fileparameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters - remove_info: The
`remove_info` parameter in the `export_variant_vcf` function is a boolean flag that determines whether to remove the INFO field from the output VCF file. If set to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included, defaults to False - add_samples: The
add_samplesparameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. The default value is True, defaults to True - list_samples: The
list_samplesparameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file - index: The
indexparameter in theexport_variant_vcffunction is a boolean flag that determines whether or not to create an index for the output VCF file. Ifindexis set toTrue, the output VCF file will be indexed using tabix. Ifindex, defaults to False - threads: The
threadsparameter in theexport_variant_vcffunction specifies the number of threads to use for exporting the VCF file. It determines how many parallel threads will be used during the export process. More threads can potentially speed up the export process by utilizing multiple cores of the processor. If
Returns
The
export_variant_vcffunction returns the result of calling theexport_outputmethod with various parameters including the output file, query, threads, sort flag, and index flag. Theexport_outputmethod is responsible for exporting the VCF data based on the specified parameters and configurations provided in theexport_variant_vcffunction.
2509 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2510 """ 2511 It takes a list of commands and runs them in parallel using the number of threads specified 2512 2513 :param commands: A list of commands to run 2514 :param threads: The number of threads to use, defaults to 1 (optional) 2515 """ 2516 2517 run_parallel_commands(commands, threads)
It takes a list of commands and runs them in parallel using the number of threads specified
Parameters
- commands: A list of commands to run
- threads: The number of threads to use, defaults to 1 (optional)
2519 def get_threads(self, default: int = 1) -> int: 2520 """ 2521 This function returns the number of threads to use for a job, with a default value of 1 if not 2522 specified. 2523 2524 :param default: The `default` parameter in the `get_threads` method is used to specify the 2525 default number of threads to use if no specific value is provided. If no value is provided for 2526 the `threads` parameter in the configuration or input parameters, the `default` value will be 2527 used, defaults to 1 2528 :type default: int (optional) 2529 :return: the number of threads to use for the current job. 2530 """ 2531 2532 # Config 2533 config = self.get_config() 2534 2535 # Param 2536 param = self.get_param() 2537 2538 # Input threads 2539 input_thread = param.get("threads", config.get("threads", None)) 2540 2541 # Check threads 2542 if not input_thread: 2543 threads = default 2544 elif int(input_thread) <= 0: 2545 threads = os.cpu_count() 2546 else: 2547 threads = int(input_thread) 2548 return threads
This function returns the number of threads to use for a job, with a default value of 1 if not specified.
Parameters
- default: The
`default` parameter in the `get_threads` method is used to specify the default number of threads to use if no specific value is provided. If no value is provided for the `threads` parameter in the configuration or input parameters, the `default` value will be used, defaults to 1
Returns
the number of threads to use for the current job.
2550 def get_memory(self, default: str = None) -> str: 2551 """ 2552 This function retrieves the memory value from parameters or configuration with a default value 2553 if not found. 2554 2555 :param default: The `get_memory` function takes in a default value as a string parameter. This 2556 default value is used as a fallback in case the `memory` parameter is not provided in the 2557 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2558 the function 2559 :type default: str 2560 :return: The `get_memory` function returns a string value representing the memory parameter. If 2561 the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will 2562 return the default value provided as an argument to the function. 2563 """ 2564 2565 # Config 2566 config = self.get_config() 2567 2568 # Param 2569 param = self.get_param() 2570 2571 # Input threads 2572 input_memory = param.get("memory", config.get("memory", None)) 2573 2574 # Check threads 2575 if input_memory: 2576 memory = input_memory 2577 else: 2578 memory = default 2579 2580 return memory
This function retrieves the memory value from parameters or configuration with a default value if not found.
Parameters
- default: The
`get_memory` function takes in a default value as a string parameter. This default value is used as a fallback in case the `memory` parameter is not provided in the `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, the function returns the default.
Returns
The
get_memoryfunction returns a string value representing the memory parameter. If theinput_memoryis provided in the parameters, it will return that value. Otherwise, it will return the default value provided as an argument to the function.
2582 def update_from_vcf(self, vcf_file: str) -> None: 2583 """ 2584 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2585 2586 :param vcf_file: the path to the VCF file 2587 """ 2588 2589 connexion_format = self.get_connexion_format() 2590 2591 if connexion_format in ["duckdb"]: 2592 self.update_from_vcf_duckdb(vcf_file) 2593 elif connexion_format in ["sqlite"]: 2594 self.update_from_vcf_sqlite(vcf_file)
If the database is duckdb, then use the parquet method, otherwise use the sqlite method
Parameters
- vcf_file: the path to the VCF file
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column of
        the given VCF file, via duckdb.

        The VCF body is loaded into a pandas DataFrame bound to the local name
        ``vcf_df``; duckdb's replacement scan resolves that exact name inside
        the SQL query (``FROM vcf_df``), so the variable must keep this name
        even though it looks unused to a linter.

        :param vcf_file: the path to the VCF file
        """

        # Variants table
        table_variants = self.get_table_variants()

        # Load the VCF into a DataFrame: skip the meta header lines, then use
        # the "#CHROM" line as the column names (header=0 after skiprows)
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Append the VCF INFO to the existing INFO for matching variants
        # (#CHROM/POS/REF/ALT), inserting a ';' separator only when both
        # sides carry a real value (not '' or '.')
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
        SET INFO = concat(
                        CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END,
                        (
                        SELECT
                            concat(
                                CASE
                                    WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                    THEN ';'
                                    ELSE ''
                                END
                                ,
                                CASE
                                    WHEN table_parquet.INFO NOT IN ('','.')
                                    THEN table_parquet.INFO
                                    ELSE ''
                                END
                            )
                        FROM vcf_df as table_parquet
                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                            AND table_parquet.\"POS\" = table_variants.\"POS\"
                            AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                            AND table_parquet.\"REF\" = table_variants.\"REF\"
                            AND table_parquet.INFO NOT IN ('','.')
                        )
                    )
        ;
        """
        self.conn.execute(sql_query_update)
It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file
Parameters
- vcf_file: the path to the VCF file
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Load a VCF file into a temporary SQLite table, then append its INFO
        column to the INFO column of the variants table for matching variants
        (#CHROM/POS/REF/ALT), and drop the temporary table.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table with the same schema as 'variants'
        # (WHERE 0 copies the structure without any rows)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF into the temporary table. All '#'-prefixed lines
        # (including the #CHROM header) are skipped by comment="#", so the
        # column names are assigned explicitly.
        # NOTE(review): assumes an 8-column VCF without FORMAT/sample columns
        # — confirm against callers.
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data, inserting a ';' separator
        # only when both INFO values are real (not '' or '.')
        # warning: CONCAT as || operator
        sql_query_update = f"""
        UPDATE variants as table_variants
        SET INFO = CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END ||
                    (
                    SELECT
                        CASE
                            WHEN table_variants.INFO NOT IN ('','.')
                                AND table_vcf.INFO NOT IN ('','.')
                            THEN ';'
                            ELSE ''
                        END ||
                        CASE
                            WHEN table_vcf.INFO NOT IN ('','.')
                            THEN table_vcf.INFO
                            ELSE ''
                        END
                    FROM {table_vcf} as table_vcf
                    WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                        AND table_vcf.\"POS\" = table_variants.\"POS\"
                        AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                        AND table_vcf.\"REF\" = table_variants.\"REF\"
                    )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)
It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table
Parameters
- vcf_file: The path to the VCF file you want to update the database with
2710 def drop_variants_table(self) -> None: 2711 """ 2712 > This function drops the variants table 2713 """ 2714 2715 table_variants = self.get_table_variants() 2716 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2717 self.conn.execute(sql_table_variants)
This function drops the variants table
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a `variant_id` column to the variants table and populate it with a
        hash built from the assembly, `#CHROM`, `POS`, `REF`, and `ALT`.

        :param variant_id_column: The name of the column to be created in the
            variants table, defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be (re)created even
            if it already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly: parameter first, then configuration, then default
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into its own column (removed again at the end)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # variant_id column
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column
        # NOTE(review): the presence check uses the literal "variant_id", not
        # variant_id_column — with a custom column name the test looks at the
        # wrong column; confirm whether this is intended.
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): '"{prefix}SVTYPE"' is single-quoted in the SQL, so
            # hash() receives the literal column-name string (a constant for
            # every row), not the SVTYPE value — confirm whether the column
            # value was intended. Changing it would change all generated ids.
            self.conn.execute(
                f"""
                UPDATE {table_variants}
                SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column
It adds a column to the variants table called variant_id and populates it with a hash of the
#CHROM, POS, REF, and ALT columns
Parameters
- variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
- force: If True, the variant_id column will be created even if it already exists
Returns
The name of the column that contains the variant_id
2778 def get_variant_id_column( 2779 self, variant_id_column: str = "variant_id", force: bool = None 2780 ) -> str: 2781 """ 2782 This function returns the variant_id column name 2783 2784 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2785 defaults to variant_id 2786 :type variant_id_column: str (optional) 2787 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2788 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2789 if it is not already set, or if it is set 2790 :type force: bool 2791 :return: The variant_id column name. 2792 """ 2793 2794 return self.set_variant_id(variant_id_column=variant_id_column, force=force)
This function returns the variant_id column name
Parameters
- variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
- force: If True, will force the variant_id to be set to the value of variant_id_column. If False, will only set the variant_id if it is not already set. If None, will set the variant_id if it is not already set, or if it is set
Returns
The variant_id column name.
2800 def scan_databases( 2801 self, 2802 database_formats: list = ["parquet"], 2803 database_releases: list = ["current"], 2804 ) -> dict: 2805 """ 2806 The function `scan_databases` scans for available databases based on specified formats and 2807 releases. 2808 2809 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2810 of the databases to be scanned. In this case, the accepted format is "parquet" 2811 :type database_formats: list ["parquet"] 2812 :param database_releases: The `database_releases` parameter is a list that specifies the 2813 releases of the databases to be scanned. In the provided function, the default value for 2814 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2815 databases that are in the "current" 2816 :type database_releases: list 2817 :return: The function `scan_databases` returns a dictionary containing information about 2818 databases that match the specified formats and releases. 2819 """ 2820 2821 # Config 2822 config = self.get_config() 2823 2824 # Param 2825 param = self.get_param() 2826 2827 # Param - Assembly 2828 assembly = param.get("assembly", config.get("assembly", None)) 2829 if not assembly: 2830 assembly = DEFAULT_ASSEMBLY 2831 log.warning(f"Default assembly '{assembly}'") 2832 2833 # Scan for availabled databases 2834 log.info( 2835 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2836 ) 2837 databases_infos_dict = databases_infos( 2838 database_folder_releases=database_releases, 2839 database_formats=database_formats, 2840 assembly=assembly, 2841 config=config, 2842 ) 2843 log.info( 2844 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2845 ) 2846 2847 return databases_infos_dict
The function scan_databases scans for available databases based on specified formats and
releases.
Parameters
- database_formats: The
database_formatsparameter is a list that specifies the formats of the databases to be scanned. In this case, the accepted format is "parquet" - database_releases: The
database_releasesparameter is a list that specifies the releases of the databases to be scanned. In the provided function, the default value fordatabase_releasesis set to["current"], meaning that by default, the function will scan databases that are in the "current"
Returns
The function
scan_databasesreturns a dictionary containing information about databases that match the specified formats and releases.
2849 def annotation(self) -> None: 2850 """ 2851 It annotates the VCF file with the annotations specified in the config file. 2852 """ 2853 2854 # Config 2855 config = self.get_config() 2856 2857 # Param 2858 param = self.get_param() 2859 2860 # Param - Assembly 2861 assembly = param.get("assembly", config.get("assembly", None)) 2862 if not assembly: 2863 assembly = DEFAULT_ASSEMBLY 2864 log.warning(f"Default assembly '{assembly}'") 2865 2866 # annotations databases folders 2867 annotations_databases = set( 2868 config.get("folders", {}) 2869 .get("databases", {}) 2870 .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER]) 2871 + config.get("folders", {}) 2872 .get("databases", {}) 2873 .get("parquet", ["~/howard/databases/parquet/current"]) 2874 + config.get("folders", {}) 2875 .get("databases", {}) 2876 .get("bcftools", ["~/howard/databases/bcftools/current"]) 2877 ) 2878 2879 # Get param annotations 2880 if param.get("annotations", None) and isinstance( 2881 param.get("annotations", None), str 2882 ): 2883 log.debug(param.get("annotations", None)) 2884 param_annotation_list = param.get("annotations").split(",") 2885 else: 2886 param_annotation_list = [] 2887 2888 # Each tools param 2889 if param.get("annotation_parquet", None) != None: 2890 log.debug( 2891 f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}""" 2892 ) 2893 if isinstance(param.get("annotation_parquet", None), list): 2894 param_annotation_list.append(",".join(param.get("annotation_parquet"))) 2895 else: 2896 param_annotation_list.append(param.get("annotation_parquet")) 2897 if param.get("annotation_snpsift", None) != None: 2898 if isinstance(param.get("annotation_snpsift", None), list): 2899 param_annotation_list.append( 2900 "snpsift:" 2901 + "+".join(param.get("annotation_snpsift")).replace(",", "+") 2902 ) 2903 else: 2904 param_annotation_list.append( 2905 "snpsift:" + param.get("annotation_snpsift").replace(",", "+") 2906 ) 2907 if param.get("annotation_snpeff", None) 
!= None: 2908 param_annotation_list.append("snpeff:" + param.get("annotation_snpeff")) 2909 if param.get("annotation_bcftools", None) != None: 2910 if isinstance(param.get("annotation_bcftools", None), list): 2911 param_annotation_list.append( 2912 "bcftools:" 2913 + "+".join(param.get("annotation_bcftools")).replace(",", "+") 2914 ) 2915 else: 2916 param_annotation_list.append( 2917 "bcftools:" + param.get("annotation_bcftools").replace(",", "+") 2918 ) 2919 if param.get("annotation_annovar", None) != None: 2920 param_annotation_list.append("annovar:" + param.get("annotation_annovar")) 2921 if param.get("annotation_exomiser", None) != None: 2922 param_annotation_list.append("exomiser:" + param.get("annotation_exomiser")) 2923 if param.get("annotation_splice", None) != None: 2924 param_annotation_list.append("splice:" + param.get("annotation_splice")) 2925 2926 # Merge param annotations list 2927 param["annotations"] = ",".join(param_annotation_list) 2928 2929 # debug 2930 log.debug(f"param_annotations={param['annotations']}") 2931 2932 if param.get("annotations"): 2933 2934 # Log 2935 # log.info("Annotations - Check annotation parameters") 2936 2937 if not "annotation" in param: 2938 param["annotation"] = {} 2939 2940 # List of annotations parameters 2941 annotations_list_input = {} 2942 if isinstance(param.get("annotations", None), str): 2943 annotation_file_list = [ 2944 value for value in param.get("annotations", "").split(",") 2945 ] 2946 for annotation_file in annotation_file_list: 2947 annotations_list_input[annotation_file] = {"INFO": None} 2948 else: 2949 annotations_list_input = param.get("annotations", {}) 2950 2951 log.info(f"Quick Annotations:") 2952 for annotation_key in list(annotations_list_input.keys()): 2953 log.info(f" {annotation_key}") 2954 2955 # List of annotations and associated fields 2956 annotations_list = {} 2957 2958 for annotation_file in annotations_list_input: 2959 2960 # Explode annotations if ALL 2961 if ( 2962 
annotation_file.upper() == "ALL" 2963 or annotation_file.upper().startswith("ALL:") 2964 ): 2965 2966 # check ALL parameters (formats, releases) 2967 annotation_file_split = annotation_file.split(":") 2968 database_formats = "parquet" 2969 database_releases = "current" 2970 for annotation_file_option in annotation_file_split[1:]: 2971 database_all_options_split = annotation_file_option.split("=") 2972 if database_all_options_split[0] == "format": 2973 database_formats = database_all_options_split[1].split("+") 2974 if database_all_options_split[0] == "release": 2975 database_releases = database_all_options_split[1].split("+") 2976 2977 # Scan for availabled databases 2978 databases_infos_dict = self.scan_databases( 2979 database_formats=database_formats, 2980 database_releases=database_releases, 2981 ) 2982 2983 # Add found databases in annotation parameters 2984 for database_infos in databases_infos_dict.keys(): 2985 annotations_list[database_infos] = {"INFO": None} 2986 2987 else: 2988 annotations_list[annotation_file] = annotations_list_input[ 2989 annotation_file 2990 ] 2991 2992 # Check each databases 2993 if len(annotations_list): 2994 2995 log.info( 2996 f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..." 
2997 ) 2998 2999 for annotation_file in annotations_list: 3000 3001 # Init 3002 annotations = annotations_list.get(annotation_file, None) 3003 3004 # Annotation snpEff 3005 if annotation_file.startswith("snpeff"): 3006 3007 log.debug(f"Quick Annotation snpEff") 3008 3009 if "snpeff" not in param["annotation"]: 3010 param["annotation"]["snpeff"] = {} 3011 3012 if "options" not in param["annotation"]["snpeff"]: 3013 param["annotation"]["snpeff"]["options"] = "" 3014 3015 # snpEff options in annotations 3016 param["annotation"]["snpeff"]["options"] = "".join( 3017 annotation_file.split(":")[1:] 3018 ) 3019 3020 # Annotation Annovar 3021 elif annotation_file.startswith("annovar"): 3022 3023 log.debug(f"Quick Annotation Annovar") 3024 3025 if "annovar" not in param["annotation"]: 3026 param["annotation"]["annovar"] = {} 3027 3028 if "annotations" not in param["annotation"]["annovar"]: 3029 param["annotation"]["annovar"]["annotations"] = {} 3030 3031 # Options 3032 annotation_file_split = annotation_file.split(":") 3033 for annotation_file_annotation in annotation_file_split[1:]: 3034 if annotation_file_annotation: 3035 param["annotation"]["annovar"]["annotations"][ 3036 annotation_file_annotation 3037 ] = annotations 3038 3039 # Annotation Exomiser 3040 elif annotation_file.startswith("exomiser"): 3041 3042 log.debug(f"Quick Annotation Exomiser") 3043 3044 param["annotation"]["exomiser"] = params_string_to_dict( 3045 annotation_file 3046 ) 3047 3048 # Annotation Splice 3049 elif annotation_file.startswith("splice"): 3050 3051 log.debug(f"Quick Annotation Splice") 3052 3053 param["annotation"]["splice"] = params_string_to_dict( 3054 annotation_file 3055 ) 3056 3057 # Annotation Parquet or BCFTOOLS 3058 else: 3059 3060 # Tools detection 3061 if annotation_file.startswith("bcftools:"): 3062 annotation_tool_initial = "bcftools" 3063 annotation_file = ":".join(annotation_file.split(":")[1:]) 3064 elif annotation_file.startswith("snpsift:"): 3065 annotation_tool_initial = 
"snpsift" 3066 annotation_file = ":".join(annotation_file.split(":")[1:]) 3067 else: 3068 annotation_tool_initial = None 3069 3070 # list of files 3071 annotation_file_list = annotation_file.replace("+", ":").split( 3072 ":" 3073 ) 3074 3075 for annotation_file in annotation_file_list: 3076 3077 if annotation_file: 3078 3079 # Annotation tool initial 3080 annotation_tool = annotation_tool_initial 3081 3082 # Find file 3083 annotation_file_found = None 3084 3085 # Expand user 3086 annotation_file = full_path(annotation_file) 3087 3088 if os.path.exists(annotation_file): 3089 annotation_file_found = annotation_file 3090 3091 else: 3092 # Find within assembly folders 3093 for annotations_database in annotations_databases: 3094 found_files = find_all( 3095 annotation_file, 3096 os.path.join( 3097 annotations_database, assembly 3098 ), 3099 ) 3100 if len(found_files) > 0: 3101 annotation_file_found = found_files[0] 3102 break 3103 if not annotation_file_found and not assembly: 3104 # Find within folders 3105 for ( 3106 annotations_database 3107 ) in annotations_databases: 3108 found_files = find_all( 3109 annotation_file, annotations_database 3110 ) 3111 if len(found_files) > 0: 3112 annotation_file_found = found_files[0] 3113 break 3114 log.debug( 3115 f"for {annotation_file} annotation_file_found={annotation_file_found}" 3116 ) 3117 3118 # Full path 3119 annotation_file_found = full_path(annotation_file_found) 3120 3121 if annotation_file_found: 3122 3123 database = Database(database=annotation_file_found) 3124 quick_annotation_format = database.get_format() 3125 quick_annotation_is_compressed = ( 3126 database.is_compressed() 3127 ) 3128 quick_annotation_is_indexed = os.path.exists( 3129 f"{annotation_file_found}.tbi" 3130 ) 3131 bcftools_preference = False 3132 3133 # Check Annotation Tool 3134 if not annotation_tool: 3135 if ( 3136 bcftools_preference 3137 and quick_annotation_format 3138 in ["vcf", "bed"] 3139 and quick_annotation_is_compressed 3140 and 
quick_annotation_is_indexed 3141 ): 3142 annotation_tool = "bcftools" 3143 elif quick_annotation_format in [ 3144 "vcf", 3145 "bed", 3146 "tsv", 3147 "tsv", 3148 "csv", 3149 "json", 3150 "tbl", 3151 "parquet", 3152 "duckdb", 3153 ]: 3154 annotation_tool = "parquet" 3155 else: 3156 log.error( 3157 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3158 ) 3159 raise ValueError( 3160 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3161 ) 3162 3163 log.debug( 3164 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 3165 ) 3166 3167 # Annotation Tool dispatch 3168 if annotation_tool: 3169 if annotation_tool not in param["annotation"]: 3170 param["annotation"][annotation_tool] = {} 3171 if ( 3172 "annotations" 3173 not in param["annotation"][annotation_tool] 3174 ): 3175 param["annotation"][annotation_tool][ 3176 "annotations" 3177 ] = {} 3178 param["annotation"][annotation_tool][ 3179 "annotations" 3180 ][annotation_file_found] = annotations 3181 3182 else: 3183 log.error( 3184 f"Quick Annotation File {annotation_file} does NOT exist" 3185 ) 3186 3187 self.set_param(param) 3188 3189 if param.get("annotation", None): 3190 log.info("Annotations") 3191 if param.get("annotation", {}).get("parquet", None): 3192 log.info("Annotations 'parquet'...") 3193 self.annotation_parquet() 3194 if param.get("annotation", {}).get("bcftools", None): 3195 log.info("Annotations 'bcftools'...") 3196 self.annotation_bcftools() 3197 if param.get("annotation", {}).get("snpsift", None): 3198 log.info("Annotations 'snpsift'...") 3199 self.annotation_snpsift() 3200 if param.get("annotation", {}).get("annovar", None): 3201 log.info("Annotations 'annovar'...") 3202 self.annotation_annovar() 3203 if param.get("annotation", {}).get("snpeff", None): 3204 log.info("Annotations 'snpeff'...") 3205 self.annotation_snpeff() 3206 if param.get("annotation", {}).get("exomiser", 
None) is not None: 3207 log.info("Annotations 'exomiser'...") 3208 self.annotation_exomiser() 3209 if param.get("annotation", {}).get("splice", None) is not None: 3210 log.info("Annotations 'splice' ...") 3211 self.annotation_splice() 3212 3213 # Explode INFOS fields into table fields 3214 if self.get_explode_infos(): 3215 self.explode_infos( 3216 prefix=self.get_explode_infos_prefix(), 3217 fields=self.get_explode_infos_fields(), 3218 force=True, 3219 )
It annotates the VCF file with the annotations specified in the config file.
3221 def annotation_snpsift(self, threads: int = None) -> None: 3222 """ 3223 This function annotate with bcftools 3224 3225 :param threads: Number of threads to use 3226 :return: the value of the variable "return_value". 3227 """ 3228 3229 # DEBUG 3230 log.debug("Start annotation with bcftools databases") 3231 3232 # Threads 3233 if not threads: 3234 threads = self.get_threads() 3235 log.debug("Threads: " + str(threads)) 3236 3237 # Config 3238 config = self.get_config() 3239 log.debug("Config: " + str(config)) 3240 3241 # Config - snpSift 3242 snpsift_bin_command = get_bin_command( 3243 bin="SnpSift.jar", 3244 tool="snpsift", 3245 bin_type="jar", 3246 config=config, 3247 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3248 ) 3249 if not snpsift_bin_command: 3250 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3251 log.error(msg_err) 3252 raise ValueError(msg_err) 3253 3254 # Config - bcftools 3255 bcftools_bin_command = get_bin_command( 3256 bin="bcftools", 3257 tool="bcftools", 3258 bin_type="bin", 3259 config=config, 3260 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3261 ) 3262 if not bcftools_bin_command: 3263 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3264 log.error(msg_err) 3265 raise ValueError(msg_err) 3266 3267 # Config - BCFTools databases folders 3268 databases_folders = set( 3269 self.get_config() 3270 .get("folders", {}) 3271 .get("databases", {}) 3272 .get("annotations", ["."]) 3273 + self.get_config() 3274 .get("folders", {}) 3275 .get("databases", {}) 3276 .get("bcftools", ["."]) 3277 ) 3278 log.debug("Databases annotations: " + str(databases_folders)) 3279 3280 # Param 3281 annotations = ( 3282 self.get_param() 3283 .get("annotation", {}) 3284 .get("snpsift", {}) 3285 .get("annotations", None) 3286 ) 3287 log.debug("Annotations: " + str(annotations)) 3288 3289 # Assembly 3290 assembly = self.get_param().get( 3291 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3292 ) 3293 
3294 # Data 3295 table_variants = self.get_table_variants() 3296 3297 # Check if not empty 3298 log.debug("Check if not empty") 3299 sql_query_chromosomes = ( 3300 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3301 ) 3302 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3303 if not sql_query_chromosomes_df["count"][0]: 3304 log.info(f"VCF empty") 3305 return 3306 3307 # VCF header 3308 vcf_reader = self.get_header() 3309 log.debug("Initial header: " + str(vcf_reader.infos)) 3310 3311 # Existing annotations 3312 for vcf_annotation in self.get_header().infos: 3313 3314 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3315 log.debug( 3316 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3317 ) 3318 3319 if annotations: 3320 3321 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3322 3323 # Export VCF file 3324 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3325 3326 # Init 3327 commands = {} 3328 3329 for annotation in annotations: 3330 annotation_fields = annotations[annotation] 3331 3332 # Annotation Name 3333 annotation_name = os.path.basename(annotation) 3334 3335 if not annotation_fields: 3336 annotation_fields = {"INFO": None} 3337 3338 log.debug(f"Annotation '{annotation_name}'") 3339 log.debug( 3340 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3341 ) 3342 3343 # Create Database 3344 database = Database( 3345 database=annotation, 3346 databases_folders=databases_folders, 3347 assembly=assembly, 3348 ) 3349 3350 # Find files 3351 db_file = database.get_database() 3352 db_file = full_path(db_file) 3353 db_hdr_file = database.get_header_file() 3354 db_hdr_file = full_path(db_hdr_file) 3355 db_file_type = database.get_format() 3356 db_tbi_file = f"{db_file}.tbi" 3357 db_file_compressed = database.is_compressed() 3358 3359 # Check if compressed 3360 if not db_file_compressed: 3361 log.error( 3362 f"Annotation '{annotation}' - {db_file} NOT 
compressed file" 3363 ) 3364 raise ValueError( 3365 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3366 ) 3367 3368 # Check if indexed 3369 if not os.path.exists(db_tbi_file): 3370 log.error( 3371 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3372 ) 3373 raise ValueError( 3374 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3375 ) 3376 3377 # Check index - try to create if not exists 3378 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3379 log.error("Annotation failed: database not valid") 3380 log.error(f"Annotation annotation file: {db_file}") 3381 log.error(f"Annotation annotation header: {db_hdr_file}") 3382 log.error(f"Annotation annotation index: {db_tbi_file}") 3383 raise ValueError( 3384 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3385 ) 3386 else: 3387 3388 log.debug( 3389 f"Annotation '{annotation}' - file: " 3390 + str(db_file) 3391 + " and " 3392 + str(db_hdr_file) 3393 ) 3394 3395 # Load header as VCF object 3396 db_hdr_vcf = Variants(input=db_hdr_file) 3397 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3398 log.debug( 3399 "Annotation database header: " 3400 + str(db_hdr_vcf_header_infos) 3401 ) 3402 3403 # For all fields in database 3404 annotation_fields_full = False 3405 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3406 annotation_fields = { 3407 key: key for key in db_hdr_vcf_header_infos 3408 } 3409 log.debug( 3410 "Annotation database header - All annotations added: " 3411 + str(annotation_fields) 3412 ) 3413 annotation_fields_full = True 3414 3415 # # Create file for field rename 3416 # log.debug("Create file for field rename") 3417 # tmp_rename = NamedTemporaryFile( 3418 # prefix=self.get_prefix(), 3419 # dir=self.get_tmp_dir(), 3420 # suffix=".rename", 3421 # delete=False, 3422 # ) 3423 # tmp_rename_name = tmp_rename.name 
3424 # tmp_files.append(tmp_rename_name) 3425 3426 # Number of fields 3427 nb_annotation_field = 0 3428 annotation_list = [] 3429 annotation_infos_rename_list = [] 3430 3431 for annotation_field in annotation_fields: 3432 3433 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3434 annotation_fields_new_name = annotation_fields.get( 3435 annotation_field, annotation_field 3436 ) 3437 if not annotation_fields_new_name: 3438 annotation_fields_new_name = annotation_field 3439 3440 # Check if field is in DB and if field is not elready in input data 3441 if ( 3442 annotation_field in db_hdr_vcf.get_header().infos 3443 and annotation_fields_new_name 3444 not in self.get_header().infos 3445 ): 3446 3447 log.info( 3448 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3449 ) 3450 3451 # BCFTools annotate param to rename fields 3452 if annotation_field != annotation_fields_new_name: 3453 annotation_infos_rename_list.append( 3454 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3455 ) 3456 3457 # Add INFO field to header 3458 db_hdr_vcf_header_infos_number = ( 3459 db_hdr_vcf_header_infos[annotation_field].num or "." 
3460 ) 3461 db_hdr_vcf_header_infos_type = ( 3462 db_hdr_vcf_header_infos[annotation_field].type 3463 or "String" 3464 ) 3465 db_hdr_vcf_header_infos_description = ( 3466 db_hdr_vcf_header_infos[annotation_field].desc 3467 or f"{annotation_field} description" 3468 ) 3469 db_hdr_vcf_header_infos_source = ( 3470 db_hdr_vcf_header_infos[annotation_field].source 3471 or "unknown" 3472 ) 3473 db_hdr_vcf_header_infos_version = ( 3474 db_hdr_vcf_header_infos[annotation_field].version 3475 or "unknown" 3476 ) 3477 3478 vcf_reader.infos[annotation_fields_new_name] = ( 3479 vcf.parser._Info( 3480 annotation_fields_new_name, 3481 db_hdr_vcf_header_infos_number, 3482 db_hdr_vcf_header_infos_type, 3483 db_hdr_vcf_header_infos_description, 3484 db_hdr_vcf_header_infos_source, 3485 db_hdr_vcf_header_infos_version, 3486 self.code_type_map[ 3487 db_hdr_vcf_header_infos_type 3488 ], 3489 ) 3490 ) 3491 3492 annotation_list.append(annotation_field) 3493 3494 nb_annotation_field += 1 3495 3496 else: 3497 3498 if ( 3499 annotation_field 3500 not in db_hdr_vcf.get_header().infos 3501 ): 3502 log.warning( 3503 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3504 ) 3505 if ( 3506 annotation_fields_new_name 3507 in self.get_header().infos 3508 ): 3509 log.warning( 3510 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3511 ) 3512 3513 log.info( 3514 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3515 ) 3516 3517 annotation_infos = ",".join(annotation_list) 3518 3519 if annotation_infos != "": 3520 3521 # Annotated VCF (and error file) 3522 tmp_annotation_vcf_name = os.path.join( 3523 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3524 ) 3525 tmp_annotation_vcf_name_err = ( 3526 tmp_annotation_vcf_name + ".err" 3527 ) 3528 3529 # Add fields to annotate 3530 if not annotation_fields_full: 3531 annotation_infos_option = f"-info {annotation_infos}" 3532 else: 
3533 annotation_infos_option = "" 3534 3535 # Info fields rename 3536 if annotation_infos_rename_list: 3537 annotation_infos_rename = " -c " + ",".join( 3538 annotation_infos_rename_list 3539 ) 3540 else: 3541 annotation_infos_rename = "" 3542 3543 # Annotate command 3544 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3545 3546 # Add command 3547 commands[command_annotate] = tmp_annotation_vcf_name 3548 3549 if commands: 3550 3551 # Export VCF file 3552 self.export_variant_vcf( 3553 vcf_file=tmp_vcf_name, 3554 remove_info=True, 3555 add_samples=False, 3556 index=True, 3557 ) 3558 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3559 3560 # Num command 3561 nb_command = 0 3562 3563 # Annotate 3564 for command_annotate in commands: 3565 nb_command += 1 3566 log.info( 3567 f"Annotation - Annotate [{nb_command}/{len(commands)}]..." 3568 ) 3569 log.debug(f"command_annotate={command_annotate}") 3570 run_parallel_commands([command_annotate], threads) 3571 3572 # Debug 3573 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3574 3575 # Update variants 3576 log.info( 3577 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3578 ) 3579 self.update_from_vcf(commands[command_annotate])
This function annotate with bcftools
Parameters
- threads: Number of threads to use
Returns
the value of the variable "return_value".
3581 def annotation_bcftools(self, threads: int = None) -> None: 3582 """ 3583 This function annotate with bcftools 3584 3585 :param threads: Number of threads to use 3586 :return: the value of the variable "return_value". 3587 """ 3588 3589 # DEBUG 3590 log.debug("Start annotation with bcftools databases") 3591 3592 # Threads 3593 if not threads: 3594 threads = self.get_threads() 3595 log.debug("Threads: " + str(threads)) 3596 3597 # Config 3598 config = self.get_config() 3599 log.debug("Config: " + str(config)) 3600 3601 # DEBUG 3602 delete_tmp = True 3603 if self.get_config().get("verbosity", "warning") in ["debug"]: 3604 delete_tmp = False 3605 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 3606 3607 # Config - BCFTools bin command 3608 bcftools_bin_command = get_bin_command( 3609 bin="bcftools", 3610 tool="bcftools", 3611 bin_type="bin", 3612 config=config, 3613 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3614 ) 3615 if not bcftools_bin_command: 3616 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3617 log.error(msg_err) 3618 raise ValueError(msg_err) 3619 3620 # Config - BCFTools databases folders 3621 databases_folders = set( 3622 self.get_config() 3623 .get("folders", {}) 3624 .get("databases", {}) 3625 .get("annotations", ["."]) 3626 + self.get_config() 3627 .get("folders", {}) 3628 .get("databases", {}) 3629 .get("bcftools", ["."]) 3630 ) 3631 log.debug("Databases annotations: " + str(databases_folders)) 3632 3633 # Param 3634 annotations = ( 3635 self.get_param() 3636 .get("annotation", {}) 3637 .get("bcftools", {}) 3638 .get("annotations", None) 3639 ) 3640 log.debug("Annotations: " + str(annotations)) 3641 3642 # Assembly 3643 assembly = self.get_param().get( 3644 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3645 ) 3646 3647 # Data 3648 table_variants = self.get_table_variants() 3649 3650 # Check if not empty 3651 log.debug("Check if not empty") 3652 sql_query_chromosomes = ( 3653 f"""SELECT 
count(*) as count FROM {table_variants} as table_variants""" 3654 ) 3655 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3656 if not sql_query_chromosomes_df["count"][0]: 3657 log.info(f"VCF empty") 3658 return 3659 3660 # Export in VCF 3661 log.debug("Create initial file to annotate") 3662 tmp_vcf = NamedTemporaryFile( 3663 prefix=self.get_prefix(), 3664 dir=self.get_tmp_dir(), 3665 suffix=".vcf.gz", 3666 delete=False, 3667 ) 3668 tmp_vcf_name = tmp_vcf.name 3669 3670 # VCF header 3671 vcf_reader = self.get_header() 3672 log.debug("Initial header: " + str(vcf_reader.infos)) 3673 3674 # Existing annotations 3675 for vcf_annotation in self.get_header().infos: 3676 3677 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3678 log.debug( 3679 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3680 ) 3681 3682 if annotations: 3683 3684 tmp_ann_vcf_list = [] 3685 commands = [] 3686 tmp_files = [] 3687 err_files = [] 3688 3689 for annotation in annotations: 3690 annotation_fields = annotations[annotation] 3691 3692 # Annotation Name 3693 annotation_name = os.path.basename(annotation) 3694 3695 if not annotation_fields: 3696 annotation_fields = {"INFO": None} 3697 3698 log.debug(f"Annotation '{annotation_name}'") 3699 log.debug( 3700 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3701 ) 3702 3703 # Create Database 3704 database = Database( 3705 database=annotation, 3706 databases_folders=databases_folders, 3707 assembly=assembly, 3708 ) 3709 3710 # Find files 3711 db_file = database.get_database() 3712 db_file = full_path(db_file) 3713 db_hdr_file = database.get_header_file() 3714 db_hdr_file = full_path(db_hdr_file) 3715 db_file_type = database.get_format() 3716 db_tbi_file = f"{db_file}.tbi" 3717 db_file_compressed = database.is_compressed() 3718 3719 # Check if compressed 3720 if not db_file_compressed: 3721 log.error( 3722 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3723 ) 
3724 raise ValueError( 3725 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3726 ) 3727 3728 # Check if indexed 3729 if not os.path.exists(db_tbi_file): 3730 log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file") 3731 raise ValueError( 3732 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3733 ) 3734 3735 # Check index - try to create if not exists 3736 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3737 log.error("Annotation failed: database not valid") 3738 log.error(f"Annotation annotation file: {db_file}") 3739 log.error(f"Annotation annotation header: {db_hdr_file}") 3740 log.error(f"Annotation annotation index: {db_tbi_file}") 3741 raise ValueError( 3742 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3743 ) 3744 else: 3745 3746 log.debug( 3747 f"Annotation '{annotation}' - file: " 3748 + str(db_file) 3749 + " and " 3750 + str(db_hdr_file) 3751 ) 3752 3753 # Load header as VCF object 3754 db_hdr_vcf = Variants(input=db_hdr_file) 3755 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3756 log.debug( 3757 "Annotation database header: " + str(db_hdr_vcf_header_infos) 3758 ) 3759 3760 # For all fields in database 3761 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3762 annotation_fields = { 3763 key: key for key in db_hdr_vcf_header_infos 3764 } 3765 log.debug( 3766 "Annotation database header - All annotations added: " 3767 + str(annotation_fields) 3768 ) 3769 3770 # Number of fields 3771 nb_annotation_field = 0 3772 annotation_list = [] 3773 3774 for annotation_field in annotation_fields: 3775 3776 # field new name, if parametered SKIPPED !!!!!! 
not managed actually TODO 3777 annotation_fields_new_name = annotation_fields.get( 3778 annotation_field, annotation_field 3779 ) 3780 if not annotation_fields_new_name: 3781 annotation_fields_new_name = annotation_field 3782 3783 # Check if field is in DB and if field is not elready in input data 3784 if ( 3785 annotation_field in db_hdr_vcf.get_header().infos 3786 and annotation_fields_new_name 3787 not in self.get_header().infos 3788 ): 3789 3790 log.info( 3791 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3792 ) 3793 3794 # Add INFO field to header 3795 db_hdr_vcf_header_infos_number = ( 3796 db_hdr_vcf_header_infos[annotation_field].num or "." 3797 ) 3798 db_hdr_vcf_header_infos_type = ( 3799 db_hdr_vcf_header_infos[annotation_field].type 3800 or "String" 3801 ) 3802 db_hdr_vcf_header_infos_description = ( 3803 db_hdr_vcf_header_infos[annotation_field].desc 3804 or f"{annotation_field} description" 3805 ) 3806 db_hdr_vcf_header_infos_source = ( 3807 db_hdr_vcf_header_infos[annotation_field].source 3808 or "unknown" 3809 ) 3810 db_hdr_vcf_header_infos_version = ( 3811 db_hdr_vcf_header_infos[annotation_field].version 3812 or "unknown" 3813 ) 3814 3815 vcf_reader.infos[annotation_fields_new_name] = ( 3816 vcf.parser._Info( 3817 annotation_fields_new_name, 3818 db_hdr_vcf_header_infos_number, 3819 db_hdr_vcf_header_infos_type, 3820 db_hdr_vcf_header_infos_description, 3821 db_hdr_vcf_header_infos_source, 3822 db_hdr_vcf_header_infos_version, 3823 self.code_type_map[db_hdr_vcf_header_infos_type], 3824 ) 3825 ) 3826 3827 # annotation_list.append(annotation_field) 3828 if annotation_field != annotation_fields_new_name: 3829 annotation_list.append( 3830 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3831 ) 3832 else: 3833 annotation_list.append(annotation_field) 3834 3835 nb_annotation_field += 1 3836 3837 else: 3838 3839 if annotation_field not in db_hdr_vcf.get_header().infos: 3840 log.warning( 3841 
f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file" 3842 ) 3843 if annotation_fields_new_name in self.get_header().infos: 3844 log.warning( 3845 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 3846 ) 3847 3848 log.info( 3849 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3850 ) 3851 3852 annotation_infos = ",".join(annotation_list) 3853 3854 if annotation_infos != "": 3855 3856 # Protect header for bcftools (remove "#CHROM" and variants line) 3857 log.debug("Protect Header file - remove #CHROM line if exists") 3858 tmp_header_vcf = NamedTemporaryFile( 3859 prefix=self.get_prefix(), 3860 dir=self.get_tmp_dir(), 3861 suffix=".hdr", 3862 delete=False, 3863 ) 3864 tmp_header_vcf_name = tmp_header_vcf.name 3865 tmp_files.append(tmp_header_vcf_name) 3866 # Command 3867 if db_hdr_file.endswith(".gz"): 3868 command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3869 else: 3870 command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3871 # Run 3872 run_parallel_commands([command_extract_header], 1) 3873 3874 # Find chomosomes 3875 log.debug("Find chromosomes ") 3876 sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\"""" 3877 sql_query_chromosomes_df = self.get_query_to_df( 3878 sql_query_chromosomes 3879 ) 3880 chomosomes_list = list(sql_query_chromosomes_df["CHROM"]) 3881 3882 log.debug("Chromosomes found: " + str(list(chomosomes_list))) 3883 3884 # BED columns in the annotation file 3885 if db_file_type in ["bed"]: 3886 annotation_infos = "CHROM,POS,POS," + annotation_infos 3887 3888 for chrom in chomosomes_list: 3889 3890 # Create BED on initial VCF 3891 log.debug("Create BED on initial VCF: " + str(tmp_vcf_name)) 3892 tmp_bed = NamedTemporaryFile( 3893 prefix=self.get_prefix(), 3894 
dir=self.get_tmp_dir(), 3895 suffix=".bed", 3896 delete=False, 3897 ) 3898 tmp_bed_name = tmp_bed.name 3899 tmp_files.append(tmp_bed_name) 3900 3901 # Detecte regions 3902 log.debug( 3903 f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..." 3904 ) 3905 window = 1000000 3906 sql_query_intervals_for_bed = f""" 3907 SELECT \"#CHROM\", 3908 CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END, 3909 \"POS\"+{window} 3910 FROM {table_variants} as table_variants 3911 WHERE table_variants.\"#CHROM\" = '{chrom}' 3912 """ 3913 regions = self.conn.execute( 3914 sql_query_intervals_for_bed 3915 ).fetchall() 3916 merged_regions = merge_regions(regions) 3917 log.debug( 3918 f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..." 3919 ) 3920 3921 header = ["#CHROM", "START", "END"] 3922 with open(tmp_bed_name, "w") as f: 3923 # Write the header with tab delimiter 3924 f.write("\t".join(header) + "\n") 3925 for d in merged_regions: 3926 # Write each data row with tab delimiter 3927 f.write("\t".join(map(str, d)) + "\n") 3928 3929 # Tmp files 3930 tmp_annotation_vcf = NamedTemporaryFile( 3931 prefix=self.get_prefix(), 3932 dir=self.get_tmp_dir(), 3933 suffix=".vcf.gz", 3934 delete=False, 3935 ) 3936 tmp_annotation_vcf_name = tmp_annotation_vcf.name 3937 tmp_files.append(tmp_annotation_vcf_name) 3938 tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}") 3939 tmp_annotation_vcf_name_err = ( 3940 tmp_annotation_vcf_name + ".err" 3941 ) 3942 err_files.append(tmp_annotation_vcf_name_err) 3943 3944 # Annotate Command 3945 log.debug( 3946 f"Annotation '{annotation}' - add bcftools command" 3947 ) 3948 3949 # Command 3950 command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 
2>>{tmp_annotation_vcf_name_err} " 3951 3952 # Add command 3953 commands.append(command_annotate) 3954 3955 # if some commands 3956 if commands: 3957 3958 # Export VCF file 3959 self.export_variant_vcf( 3960 vcf_file=tmp_vcf_name, 3961 remove_info=True, 3962 add_samples=False, 3963 index=True, 3964 ) 3965 3966 # Threads 3967 # calculate threads for annotated commands 3968 if commands: 3969 threads_bcftools_annotate = round(threads / len(commands)) 3970 else: 3971 threads_bcftools_annotate = 1 3972 3973 if not threads_bcftools_annotate: 3974 threads_bcftools_annotate = 1 3975 3976 # Add threads option to bcftools commands 3977 if threads_bcftools_annotate > 1: 3978 commands_threaded = [] 3979 for command in commands: 3980 commands_threaded.append( 3981 command.replace( 3982 f"{bcftools_bin_command} annotate ", 3983 f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ", 3984 ) 3985 ) 3986 commands = commands_threaded 3987 3988 # Command annotation multithreading 3989 log.debug(f"Annotation - Annotation commands: " + str(commands)) 3990 log.info( 3991 f"Annotation - Annotation multithreaded in " 3992 + str(len(commands)) 3993 + " commands" 3994 ) 3995 3996 run_parallel_commands(commands, threads) 3997 3998 # Merge 3999 tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list) 4000 4001 if tmp_ann_vcf_list_cmd: 4002 4003 # Tmp file 4004 tmp_annotate_vcf = NamedTemporaryFile( 4005 prefix=self.get_prefix(), 4006 dir=self.get_tmp_dir(), 4007 suffix=".vcf.gz", 4008 delete=True, 4009 ) 4010 tmp_annotate_vcf_name = tmp_annotate_vcf.name 4011 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 4012 err_files.append(tmp_annotate_vcf_name_err) 4013 4014 # Tmp file remove command 4015 tmp_files_remove_command = "" 4016 if tmp_files: 4017 tmp_files_remove_command = " && rm -f " + " ".join(tmp_files) 4018 4019 # Command merge 4020 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o 
{tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}" 4021 log.info( 4022 f"Annotation - Annotation merging " 4023 + str(len(commands)) 4024 + " annotated files" 4025 ) 4026 log.debug(f"Annotation - merge command: {merge_command}") 4027 run_parallel_commands([merge_command], 1) 4028 4029 # Error messages 4030 log.info(f"Error/Warning messages:") 4031 error_message_command_all = [] 4032 error_message_command_warning = [] 4033 error_message_command_err = [] 4034 for err_file in err_files: 4035 with open(err_file, "r") as f: 4036 for line in f: 4037 message = line.strip() 4038 error_message_command_all.append(message) 4039 if line.startswith("[W::"): 4040 error_message_command_warning.append(message) 4041 if line.startswith("[E::"): 4042 error_message_command_err.append( 4043 f"{err_file}: " + message 4044 ) 4045 # log info 4046 for message in list( 4047 set(error_message_command_err + error_message_command_warning) 4048 ): 4049 log.info(f" {message}") 4050 # debug info 4051 for message in list(set(error_message_command_all)): 4052 log.debug(f" {message}") 4053 # failed 4054 if len(error_message_command_err): 4055 log.error("Annotation failed: Error in commands") 4056 raise ValueError("Annotation failed: Error in commands") 4057 4058 # Update variants 4059 log.info(f"Annotation - Updating...") 4060 self.update_from_vcf(tmp_annotate_vcf_name)
This function annotates variants with bcftools.
Parameters
- threads: number of threads to use
Returns
None.
    def annotation_exomiser(self, threads: int = None) -> bool:
        """
        Annotate loaded variants with Exomiser.

        Parameters are read from the param section "annotation" -> "exomiser":
        - "analysis" (dict/file):
            Full analysis dictionary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO).
            Default: None
        - "preset" (string):
            Analysis preset (available in config folder).
            Used if no full "analysis" is provided.
            Default: "exome"
        - "phenopacket" (dict/file):
            Samples and phenotypic features parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            Default: None
        - "subject" (dict):
            Sample parameters (see Exomiser docs).
            Example:
                "subject": {"id": "ISDBM322017", "sex": "FEMALE"}
            Default: None
        - "sample" (string):
            Sample name used to construct the "subject" section:
                "subject": {"id": "<sample>", "sex": "UNKNOWN_SEX"}
            Default: None
        - "phenotypicFeatures" (list):
            Phenotypic features used to construct the "subject" section.
            Example:
                "phenotypicFeatures": [
                    {"type": {"id": "HP:0001159", "label": "Syndactyly"}},
                    {"type": {"id": "HP:0000486", "label": "Strabismus"}}
                ]
        - "hpo" (list):
            List of HPO ids as phenotypic features.
            Example: "hpo": ['0001156', '0001363', '0011304', '0010055']
            Default: []
        - "outputOptions" (dict):
            Output options (see Exomiser docs).
            Default:
                {
                    "outputContributingVariantsOnly": False,
                    "numGenes": 0,
                    "outputFormats": ["TSV_VARIANT", "VCF"]
                }
        - "transcript_source" (string):
            Transcript source (either "refseq", "ucsc", "ensembl").
            Default: "refseq"
        - "exomiser_to_info" (boolean):
            Add Exomiser TSV file columns as INFO fields in VCF.
            Default: False
        - "release" (string):
            Exomiser database release.
            If it does not exist, the database release will be downloaded (takes a while).
            Default: None (provided by application.properties configuration file)
        - "exomiser_application_properties" (file):
            Exomiser configuration file (see Exomiser docs).
            Useful to automatically download databases (especially for specific genome databases).

        Notes:
        - If no sample is given in parameters, the first sample in the VCF is chosen.
        - If no HPO is found, the "hiPhivePrioritiser" analysis step is switched off.

        :param threads: The number of threads to use
        :return: False if the VCF is empty or has no samples, True otherwise
        """

        # DEBUG
        log.debug("Start annotation with Exomiser databases")

        # Threads: fall back to instance-level thread count when not provided
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        # Root folder of the Exomiser databases (per-assembly subfolders expected)
        databases_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
        )
        databases_folders = full_path(databases_folders)
        # NOTE(review): a missing folder is only logged here, not raised;
        # databases_download_exomiser below is expected to create/populate it — confirm
        if not os.path.exists(databases_folders):
            log.error(f"Databases annotations: {databases_folders} NOT found")
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - Exomiser executable (jar resolved through the tool config)
        exomiser_bin_command = get_bin_command(
            bin="exomiser-cli*.jar",
            tool="exomiser",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
        )
        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
        if not exomiser_bin_command:
            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - Exomiser section (see docstring for keys)
        param_exomiser = param.get("annotation", {}).get("exomiser", {})
        log.debug(f"Param Exomiser: {param_exomiser}")

        # Param - Assembly (param overrides config, config overrides default)
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
        log.debug("Assembly: " + str(assembly))

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return False

        # VCF header (mutated below when new INFO fields are added)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Samples: Exomiser needs at least one sample/genotype column
        samples = self.get_header_sample_list()
        if not samples:
            log.error("No Samples in VCF")
            return False
        log.debug(f"Samples: {samples}")

        # Memory limit (default 8G)
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # Exomiser java options
        # NOTE(review): computed and logged but not visibly injected into the
        # command built below — confirm whether get_bin_command applies it
        exomiser_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {exomiser_java_options}")

        # Download Exomiser databases for the assembly if not already present
        exomiser_release = param_exomiser.get("release", None)
        exomiser_application_properties = param_exomiser.get(
            "exomiser_application_properties", None
        )
        databases_download_exomiser(
            assemblies=[assembly],
            exomiser_folder=databases_folders,
            exomiser_release=exomiser_release,
            exomiser_phenotype_release=exomiser_release,
            exomiser_application_properties=exomiser_application_properties,
        )

        # Force annotation (always re-annotate, even if "Exomiser" already in header)
        force_update_annotation = True

        if "Exomiser" not in self.get_header().infos or force_update_annotation:
            log.debug("Start annotation Exomiser")

            # All intermediate files live in a temporary directory
            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                ### ANALYSIS ###
                ################

                # Build the analysis dict either from param "analysis"
                # or from a preset config file (exome/genome)

                # Init analysis dict
                param_exomiser_analysis_dict = {}

                # analysis from param
                param_exomiser_analysis = param_exomiser.get("analysis", {})
                param_exomiser_analysis = full_path(param_exomiser_analysis)

                # If analysis in param -> load analysis json/yaml
                if param_exomiser_analysis:

                    # If param analysis is a file and exists
                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
                        param_exomiser_analysis
                    ):
                        # Load analysis file into analysis dict (either yaml or json;
                        # yaml.safe_load parses both)
                        with open(param_exomiser_analysis) as json_file:
                            param_exomiser_analysis_dict = yaml.safe_load(json_file)

                    # If param analysis is a dict, use it as-is
                    elif isinstance(param_exomiser_analysis, dict):
                        param_exomiser_analysis_dict = param_exomiser_analysis

                    # Error analysis type
                    else:
                        log.error(f"Analysis type unknown. Check param file.")
                        raise ValueError(f"Analysis type unknown. Check param file.")

                # Case no input analysis config file/dict:
                # use preset (exome/genome) to open a default config file
                if not param_exomiser_analysis_dict:

                    # default preset
                    default_preset = "exome"

                    # Get param preset or default preset
                    param_exomiser_preset = param_exomiser.get("preset", default_preset)

                    # Resolve the preset to a file, trying in order:
                    # full path, basename in config folder, constructed name
                    if os.path.exists(param_exomiser_preset):
                        # Preset file is provided in full path
                        param_exomiser_analysis_default_config_file = (
                            param_exomiser_preset
                        )
                    elif os.path.exists(
                        os.path.join(folder_config, param_exomiser_preset)
                    ):
                        # Preset file is provided as a basename in config folder
                        # (can be a path with subfolders)
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config, param_exomiser_preset
                        )
                    else:
                        # Construct preset file name from the preset keyword
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config,
                            f"preset-{param_exomiser_preset}-analysis.json",
                        )

                    # If preset file exists, load it under the "analysis" key
                    param_exomiser_analysis_default_config_file = full_path(
                        param_exomiser_analysis_default_config_file
                    )
                    if os.path.exists(param_exomiser_analysis_default_config_file):
                        # Load preset file into analysis dict (either yaml or json)
                        with open(
                            param_exomiser_analysis_default_config_file
                        ) as json_file:
                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
                                json_file
                            )

                    # Error preset file
                    else:
                        log.error(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )
                        raise ValueError(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )

                # If no analysis dict could be created, abort
                if not param_exomiser_analysis_dict:
                    log.error(f"No analysis config")
                    raise ValueError(f"No analysis config")

                # Log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### PHENOPACKET ###
                ###################

                # If no PhenoPacket in analysis dict -> check in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # If PhenoPacket in param -> load it
                    if param_exomiser.get("phenopacket", None):

                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
                        param_exomiser_phenopacket = full_path(
                            param_exomiser_phenopacket
                        )

                        # If param phenopacket is a file and exists
                        if isinstance(
                            param_exomiser_phenopacket, str
                        ) and os.path.exists(param_exomiser_phenopacket):
                            # Load phenopacket file into analysis dict (yaml or json)
                            with open(param_exomiser_phenopacket) as json_file:
                                param_exomiser_analysis_dict["phenopacket"] = (
                                    yaml.safe_load(json_file)
                                )

                        # If param phenopacket is a dict, use it as-is
                        elif isinstance(param_exomiser_phenopacket, dict):
                            param_exomiser_analysis_dict["phenopacket"] = (
                                param_exomiser_phenopacket
                            )

                        # Error phenopacket type
                        else:
                            log.error(f"Phenopacket type unknown. Check param file.")
                            raise ValueError(
                                f"Phenopacket type unknown. Check param file."
                            )

                # If still no PhenoPacket -> construct one from sample and HPO params
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # Init PhenoPacket
                    param_exomiser_analysis_dict["phenopacket"] = {
                        "id": "analysis",
                        "proband": {},
                    }

                    ### Add subject ###

                    # If subject exists in param, use it
                    param_exomiser_subject = param_exomiser.get("subject", {})

                    # If subject not provided -> find a sample ID
                    if not param_exomiser_subject:

                        # Sample ID from param
                        sample = param_exomiser.get("sample", None)

                        # Fallback: first sample of the VCF header
                        if not sample:
                            sample_list = self.get_header_sample_list()
                            if len(sample_list) > 0:
                                sample = sample_list[0]
                            else:
                                log.error(f"No sample found")
                                raise ValueError(f"No sample found")

                        # Create subject with unknown sex
                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "subject"
                    ] = param_exomiser_subject

                    ### Add "phenotypicFeatures" ###

                    # If phenotypicFeatures exists in param, use it
                    param_exomiser_phenotypicfeatures = param_exomiser.get(
                        "phenotypicFeatures", []
                    )

                    # If not -> try to infer from "hpo" list
                    if not param_exomiser_phenotypicfeatures:

                        # HPO ids in param
                        param_exomiser_hpo = param_exomiser.get("hpo", [])

                        # Split HPO if given as a comma-separated string
                        if isinstance(param_exomiser_hpo, str):
                            param_exomiser_hpo = param_exomiser_hpo.split(",")

                        # Create phenotypicFeatures entries; keep only the digits
                        # of each HPO id (accepts "HP:0001156", "0001156", ...)
                        for hpo in param_exomiser_hpo:
                            hpo_clean = re.sub("[^0-9]", "", hpo)
                            param_exomiser_phenotypicfeatures.append(
                                {
                                    "type": {
                                        "id": f"HP:{hpo_clean}",
                                        "label": f"HP:{hpo_clean}",
                                    }
                                }
                            )

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "phenotypicFeatures"
                    ] = param_exomiser_phenotypicfeatures

                    # Without phenotypic features, hiPhivePrioritiser cannot run:
                    # remove that step from the analysis steps.
                    # NOTE(review): this removes from the list being iterated —
                    # safe only while at most one matching step exists; confirm
                    if not param_exomiser_phenotypicfeatures:
                        for step in param_exomiser_analysis_dict.get(
                            "analysis", {}
                        ).get("steps", []):
                            if "hiPhivePrioritiser" in step:
                                param_exomiser_analysis_dict.get("analysis", {}).get(
                                    "steps", []
                                ).remove(step)

                ### Add Input File ###

                # Initial VCF file name and htsFiles section
                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
                    {
                        "uri": tmp_vcf_name,
                        "htsFormat": "VCF",
                        "genomeAssembly": assembly,
                    }
                ]

                ### Add metaData ###

                # NOTE(review): the check is on the top-level dict but the write
                # goes into "phenopacket" — a phenopacket lacking metaData may be
                # overwritten or left without it; confirm intended level
                if "metaData" not in param_exomiser_analysis_dict:
                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
                        "createdBy": "howard",
                        "phenopacketSchemaVersion": 1,
                    }

                ### OutputOptions ###

                # Init output result folder (inside tmp_dir)
                output_results = os.path.join(tmp_dir, "results")

                # If no outputOptions in analysis dict
                if "outputOptions" not in param_exomiser_analysis_dict:

                    # default output formats
                    defaut_output_formats = ["TSV_VARIANT", "VCF"]

                    # Get outputOptions in param
                    output_options = param_exomiser.get("outputOptions", None)

                    # If no output_options in param -> use defaults
                    if not output_options:
                        output_options = {
                            "outputContributingVariantsOnly": False,
                            "numGenes": 0,
                            "outputFormats": defaut_output_formats,
                        }

                    # Force output directory/file name so results are found below
                    output_options["outputDirectory"] = output_results
                    output_options["outputFileName"] = "howard"

                    # Add outputOptions to analysis dict
                    param_exomiser_analysis_dict["outputOptions"] = output_options

                else:

                    # Force output directory and ensure required output formats
                    # (TSV_VARIANT and VCF) are present in addition to user's
                    param_exomiser_analysis_dict["outputOptions"][
                        "outputDirectory"
                    ] = output_results
                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
                        list(
                            set(
                                param_exomiser_analysis_dict.get(
                                    "outputOptions", {}
                                ).get("outputFormats", [])
                                + ["TSV_VARIANT", "VCF"]
                            )
                        )
                    )

                # log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### ANALYSIS FILE ###
                #####################

                ### Full JSON analysis config file ###

                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
                with open(exomiser_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict, fp, indent=4)

                ### Split analysis and sample config files ###

                # Shallow copy of the analysis dict for splitting
                param_exomiser_analysis_dict_for_split = (
                    param_exomiser_analysis_dict.copy()
                )

                # Phenopacket JSON file
                exomiser_analysis_phenopacket = os.path.join(
                    tmp_dir, "analysis_phenopacket.json"
                )
                with open(exomiser_analysis_phenopacket, "w") as fp:
                    json.dump(
                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
                        fp,
                        indent=4,
                    )

                # Analysis JSON file without Phenopacket parameters
                param_exomiser_analysis_dict_for_split.pop("phenopacket")
                exomiser_analysis_analysis = os.path.join(
                    tmp_dir, "analysis_analysis.json"
                )
                with open(exomiser_analysis_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)

                ### INITIAL VCF file ###
                ########################

                ### Build the list of samples to include in the initial VCF ###

                # Subject (main sample): sample ID from the analysis dict
                sample_subject = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample_proband = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("proband", {})
                    .get("subject", {})
                    .get("id", None)
                )
                # NOTE: "sample" here is a list (subject + proband ids)
                sample = []
                if sample_subject:
                    sample.append(sample_subject)
                if sample_proband:
                    sample.append(sample_proband)

                # Sample IDs within the pedigree (if any)
                pedigree_persons_list = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("pedigree", {})
                    .get("persons", {})
                )

                # Collect all individualIds from the pedigree
                pedigree_persons = []
                for person in pedigree_persons_list:
                    pedigree_persons.append(person.get("individualId"))

                # Unique union of subject/proband and pedigree sample IDs
                # (rebinds "samples" from the earlier header sample list)
                samples = list(set(sample + pedigree_persons))

                # Check if sample list is not empty
                if not samples:
                    log.error(f"No samples found")
                    raise ValueError(f"No samples found")

                # Export initial VCF restricted to the selected samples,
                # INFO column stripped (Exomiser recomputes annotations)
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=True,
                    list_samples=samples,
                    index=False,
                )

                ### Execute Exomiser ###
                ########################

                # Init command
                # NOTE(review): exomiser_command is never used afterwards
                exomiser_command = ""

                # Spring/Exomiser options pointing at the assembly database folder
                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "

                # Release-specific options
                exomiser_release = param_exomiser.get("release", None)
                if exomiser_release:
                    # phenotype data version
                    exomiser_options += (
                        f" --exomiser.phenotype.data-version={exomiser_release} "
                    )
                    # data version
                    exomiser_options += (
                        f" --exomiser.{assembly}.data-version={exomiser_release} "
                    )
                    # variant white list (only if present in the database folder)
                    variant_white_list_file = (
                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
                    )
                    if os.path.exists(
                        os.path.join(
                            databases_folders, assembly, variant_white_list_file
                        )
                    ):
                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "

                # transcript_source: ucsc, refseq or ensembl
                transcript_source = param_exomiser.get(
                    "transcript_source", None
                )
                if transcript_source:
                    exomiser_options += (
                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
                    )

                # If analysis contains a proband -> use split analysis + sample files
                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
                    "proband", {}
                ):
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "

                # If no proband (usually a single sample) -> use the full analysis file
                else:
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"

                # Log
                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")

                # Run command; non-zero exit status is an error
                result = subprocess.call(
                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
                )
                if result:
                    log.error("Exomiser command failed")
                    raise ValueError("Exomiser command failed")

                ### RESULTS ###
                ###############

                ### Annotate with TSV fields ###

                # Whether to explode TSV columns into INFO fields
                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)

                # Result tsv file produced by Exomiser (TSV_VARIANT format)
                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")

                # Parse TSV file and explode columns into the INFO field
                if exomiser_to_info and os.path.exists(output_results_tsv):

                    # Log
                    log.debug("Exomiser columns to VCF INFO field")

                    # Retrieve columns and types via DuckDB (LIMIT 0: header only)
                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
                    output_results_tsv_df = self.get_query_to_df(query)
                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()

                    # Init concat fields for update
                    sql_query_update_concat_fields = []

                    # Coordinate/core columns that must not become INFO fields
                    fields_to_avoid = [
                        "CONTIG",
                        "START",
                        "END",
                        "REF",
                        "ALT",
                        "QUAL",
                        "FILTER",
                        "GENOTYPE",
                    ]

                    # Add each remaining column to the header and the update query
                    for header_column in output_results_tsv_columns:

                        # If header column is enabled
                        if header_column not in fields_to_avoid:

                            # Infer VCF INFO Type from the column dtype:
                            # object dtype that fully parses as numeric -> Float,
                            # other object dtype -> String,
                            # any non-object dtype -> Integer
                            # NOTE(review): non-object float dtypes are labeled
                            # "Integer" here — confirm this is intended
                            header_info_type = "String"
                            header_column_df = output_results_tsv_df[header_column]
                            header_column_df_dtype = header_column_df.dtype
                            if header_column_df_dtype == object:
                                if (
                                    pd.to_numeric(header_column_df, errors="coerce")
                                    .notnull()
                                    .all()
                                ):
                                    header_info_type = "Float"
                            else:
                                header_info_type = "Integer"

                            # INFO field name: "Exomiser_<column>", '#' removed,
                            # '-' replaced by '_' (invalid in INFO keys)
                            characters_to_validate = ["-"]
                            pattern = "[" + "".join(characters_to_validate) + "]"
                            header_info_name = re.sub(
                                pattern,
                                "_",
                                f"Exomiser_{header_column}".replace("#", ""),
                            )
                            header_info_number = "."
                            header_info_description = (
                                f"Exomiser {header_column} annotation"
                            )
                            header_info_source = "Exomiser"
                            header_info_version = "unknown"
                            header_info_code = CODE_TYPE_MAP[header_info_type]
                            # Register the new INFO field in the in-memory header
                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
                                header_info_name,
                                header_info_number,
                                header_info_type,
                                header_info_description,
                                header_info_source,
                                header_info_version,
                                header_info_code,
                            )

                            # SQL fragment: emit "name=value;" only for non-empty values
                            sql_query_update_concat_fields.append(
                                f"""
                                CASE
                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
                                    THEN concat(
                                        '{header_info_name}=',
                                        table_parquet."{header_column}",
                                        ';'
                                    )

                                    ELSE ''
                                END
                                """
                            )

                    # Update query: append the concatenated Exomiser fields to INFO,
                    # joining TSV rows to variants on CHROM/POS/REF/ALT
                    # ("chr" prefix added to the TSV CONTIG column)
                    sql_query_update = f"""
                        UPDATE {table_variants} as table_variants
                        SET INFO = concat(
                                        CASE
                                            WHEN INFO NOT IN ('', '.')
                                            THEN INFO
                                            ELSE ''
                                        END,
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END,
                                        (
                                        SELECT
                                            concat(
                                                {",".join(sql_query_update_concat_fields)}
                                            )
                                        FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
                                        WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
                                            AND table_parquet.\"START\" = table_variants.\"POS\"
                                            AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                            AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        )
                                )
                        ;
                        """

                    # Update
                    self.conn.execute(sql_query_update)

                ### Annotate with VCF INFO field ###

                # Result VCF file produced by Exomiser
                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")

                # If VCF exists
                if os.path.exists(output_results_vcf):

                    # Log
                    log.debug("Exomiser result VCF update variants")

                    # Find Exomiser INFO field annotation in the result header
                    with gzip.open(output_results_vcf, "rt") as f:
                        header_list = self.read_vcf_header(f)
                    exomiser_vcf_header = vcf.Reader(
                        io.StringIO("\n".join(header_list))
                    )

                    # Add annotation INFO field to the in-memory header
                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]

                    # Update variants table with the result VCF
                    self.update_from_vcf(output_results_vcf)

        return True
This function annotate with Exomiser
This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
- "analysis" (dict/file): Full analysis dictionary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO). Default: None
- "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
- "phenopacket" (dict/file): Samples and phenotypic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
- "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
- "sample" (string): Sample name used to construct the "subject" section: "subject": { "id": "<sample>", "sex": "UNKNOWN_SEX" } Default: None
- "phenotypicFeatures" (dict): Phenotypic features used to construct the "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
- "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
- "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
- "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
- "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
- "release" (string): Exomiser database release. If it does not exist, the database release will be downloaded (takes a while). Default: None (provided by application.properties configuration file)
- "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).
Notes:
- If no sample in parameters, first sample in VCF will be chosen
- If no HPO is found, the "hiPhivePrioritiser" analysis step will be switched off
Parameters
- threads: The number of threads to use
Returns
None.
4841 def annotation_snpeff(self, threads: int = None) -> None: 4842 """ 4843 This function annotate with snpEff 4844 4845 :param threads: The number of threads to use 4846 :return: the value of the variable "return_value". 4847 """ 4848 4849 # DEBUG 4850 log.debug("Start annotation with snpeff databases") 4851 4852 # Threads 4853 if not threads: 4854 threads = self.get_threads() 4855 log.debug("Threads: " + str(threads)) 4856 4857 # DEBUG 4858 delete_tmp = True 4859 if self.get_config().get("verbosity", "warning") in ["debug"]: 4860 delete_tmp = False 4861 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4862 4863 # Config 4864 config = self.get_config() 4865 log.debug("Config: " + str(config)) 4866 4867 # Config - Folders - Databases 4868 databases_folders = ( 4869 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 4870 ) 4871 log.debug("Databases annotations: " + str(databases_folders)) 4872 4873 # # Config - Java 4874 # java_bin = get_bin( 4875 # tool="java", 4876 # bin="java", 4877 # bin_type="bin", 4878 # config=config, 4879 # default_folder="/usr/bin", 4880 # ) 4881 # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))): 4882 # log.error(f"Annotation failed: no java bin '{java_bin}'") 4883 # raise ValueError(f"Annotation failed: no java bin '{java_bin}'") 4884 4885 # # Config - snpEff bin 4886 # snpeff_jar = get_bin( 4887 # tool="snpeff", 4888 # bin="snpEff.jar", 4889 # bin_type="jar", 4890 # config=config, 4891 # default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4892 # ) 4893 # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))): 4894 # log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4895 # raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4896 4897 # Config - snpEff bin command 4898 snpeff_bin_command = get_bin_command( 4899 bin="snpEff.jar", 4900 tool="snpeff", 4901 bin_type="jar", 4902 config=config, 4903 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4904 ) 
4905 if not snpeff_bin_command: 4906 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 4907 log.error(msg_err) 4908 raise ValueError(msg_err) 4909 4910 # Config - snpEff databases 4911 snpeff_databases = ( 4912 config.get("folders", {}) 4913 .get("databases", {}) 4914 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 4915 ) 4916 snpeff_databases = full_path(snpeff_databases) 4917 if snpeff_databases is not None and snpeff_databases != "": 4918 log.debug(f"Create snpEff databases folder") 4919 if not os.path.exists(snpeff_databases): 4920 os.makedirs(snpeff_databases) 4921 4922 # Param 4923 param = self.get_param() 4924 log.debug("Param: " + str(param)) 4925 4926 # Param 4927 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 4928 log.debug("Options: " + str(options)) 4929 4930 # Param - Assembly 4931 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4932 4933 # Param - Options 4934 snpeff_options = ( 4935 param.get("annotation", {}).get("snpeff", {}).get("options", "") 4936 ) 4937 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 4938 snpeff_csvstats = ( 4939 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 4940 ) 4941 if snpeff_stats: 4942 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 4943 snpeff_stats = full_path(snpeff_stats) 4944 snpeff_options += f" -stats {snpeff_stats}" 4945 if snpeff_csvstats: 4946 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 4947 snpeff_csvstats = full_path(snpeff_csvstats) 4948 snpeff_options += f" -csvStats {snpeff_csvstats}" 4949 4950 # Data 4951 table_variants = self.get_table_variants() 4952 4953 # Check if not empty 4954 log.debug("Check if not empty") 4955 sql_query_chromosomes = ( 4956 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4957 ) 4958 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 4959 if not 
self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4960 log.info(f"VCF empty") 4961 return 4962 4963 # Export in VCF 4964 log.debug("Create initial file to annotate") 4965 tmp_vcf = NamedTemporaryFile( 4966 prefix=self.get_prefix(), 4967 dir=self.get_tmp_dir(), 4968 suffix=".vcf.gz", 4969 delete=True, 4970 ) 4971 tmp_vcf_name = tmp_vcf.name 4972 4973 # VCF header 4974 vcf_reader = self.get_header() 4975 log.debug("Initial header: " + str(vcf_reader.infos)) 4976 4977 # Existing annotations 4978 for vcf_annotation in self.get_header().infos: 4979 4980 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 4981 log.debug( 4982 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 4983 ) 4984 4985 # Memory limit 4986 # if config.get("memory", None): 4987 # memory_limit = config.get("memory", "8G") 4988 # else: 4989 # memory_limit = "8G" 4990 memory_limit = self.get_memory("8G") 4991 log.debug(f"memory_limit: {memory_limit}") 4992 4993 # snpEff java options 4994 snpeff_java_options = ( 4995 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4996 ) 4997 log.debug(f"Exomiser java options: {snpeff_java_options}") 4998 4999 force_update_annotation = True 5000 5001 if "ANN" not in self.get_header().infos or force_update_annotation: 5002 5003 # Check snpEff database 5004 log.debug(f"Check snpEff databases {[assembly]}") 5005 databases_download_snpeff( 5006 folder=snpeff_databases, assemblies=[assembly], config=config 5007 ) 5008 5009 # Export VCF file 5010 self.export_variant_vcf( 5011 vcf_file=tmp_vcf_name, 5012 remove_info=True, 5013 add_samples=False, 5014 index=True, 5015 ) 5016 5017 # Tmp file 5018 err_files = [] 5019 tmp_annotate_vcf = NamedTemporaryFile( 5020 prefix=self.get_prefix(), 5021 dir=self.get_tmp_dir(), 5022 suffix=".vcf", 5023 delete=False, 5024 ) 5025 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5026 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5027 
err_files.append(tmp_annotate_vcf_name_err) 5028 5029 # Command 5030 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 5031 log.debug(f"Annotation - snpEff command: {snpeff_command}") 5032 run_parallel_commands([snpeff_command], 1) 5033 5034 # Error messages 5035 log.info(f"Error/Warning messages:") 5036 error_message_command_all = [] 5037 error_message_command_warning = [] 5038 error_message_command_err = [] 5039 for err_file in err_files: 5040 with open(err_file, "r") as f: 5041 for line in f: 5042 message = line.strip() 5043 error_message_command_all.append(message) 5044 if line.startswith("[W::"): 5045 error_message_command_warning.append(message) 5046 if line.startswith("[E::"): 5047 error_message_command_err.append(f"{err_file}: " + message) 5048 # log info 5049 for message in list( 5050 set(error_message_command_err + error_message_command_warning) 5051 ): 5052 log.info(f" {message}") 5053 # debug info 5054 for message in list(set(error_message_command_all)): 5055 log.debug(f" {message}") 5056 # failed 5057 if len(error_message_command_err): 5058 log.error("Annotation failed: Error in commands") 5059 raise ValueError("Annotation failed: Error in commands") 5060 5061 # Find annotation in header 5062 with open(tmp_annotate_vcf_name, "rt") as f: 5063 header_list = self.read_vcf_header(f) 5064 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5065 5066 for ann in annovar_vcf_header.infos: 5067 if ann not in self.get_header().infos: 5068 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5069 5070 # Update variants 5071 log.info(f"Annotation - Updating...") 5072 self.update_from_vcf(tmp_annotate_vcf_name) 5073 5074 else: 5075 if "ANN" in self.get_header().infos: 5076 log.debug(f"Existing snpEff annotations in VCF") 5077 if force_update_annotation: 5078 log.debug(f"Existing snpEff annotations in VCF - annotation forced")
This function annotates variants with snpEff.
Parameters
- threads: the number of threads to use (defaults to the configured thread count)
Returns
None; the variants table is updated in place, and the method returns early when the table is empty.
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate the variants table with Annovar (table_annovar.pl).

        For each configured Annovar database, the variants are exported to a
        temporary VCF, pushed through an Annovar + bcftools + sed + awk shell
        pipeline, and the per-database annotated files are merged back into
        the variants table. Temporary files are removed at the end.

        :param threads: number of threads to use (defaults to the configured
            thread count)
        :return: None; returns early when the variants table is empty
        :raises ValueError: if the Annovar or bcftools bin command cannot be
            resolved, or if a pipeline command reported errors
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files collected for final cleanup
        tmp_files = []
        err_files = []

        # DEBUG
        # NOTE(review): delete_tmp is computed for logging only; the actual
        # cleanup at the bottom runs unconditionally — confirm intent.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        annovar_databases = full_path(annovar_databases)
        if annovar_databases != "" and not os.path.exists(annovar_databases):
            os.makedirs(annovar_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations: {database_name: {field: new_name_or_None}}
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug trace only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Annotation is always (re)computed: force flag is hard-coded
        force_update_annotation = True

        if annotations:

            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF (delete=False: consumed by the shell pipeline)
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file (INFO stripped to ".", no samples)
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (download missing files for the assembly)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One Annovar run (and one annotated VCF) per configured database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                # table_annovar.pl writes <prefix>.<assembly>_multianno.vcf
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (one "old new" pair per line)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: "g" gene-based, "r" region-based, "f" filter-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options ("genebase" is consumed above, not forwarded)
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        # "^INFO/x" = keep x, remove all other INFO fields
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: scan stderr, split warnings/errors; any
                # error aborts the annotation
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file for the merged result
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge: original export + all per-database outputs
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header: merge new INFO fields into ours
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants from the merged annotated VCF
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # Tmp file remove command
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)
It takes a VCF file, annotates it with Annovar, and then updates the database with the new annotations.
Parameters
- threads: number of threads to use (defaults to the configured thread count)
Returns
None; the variants table is updated in place, and the method returns early when the table is empty.
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the variants table from parquet/duckdb annotation databases.

        For each configured database, INFO fields present in the database
        header are appended to the variants' INFO column through generated SQL
        UPDATE queries (one query per chromosome, plus optional field-removal
        queries when updates are forced).

        :param threads: number of threads to use for the annotation
        :return: None; returns early when the variants table is empty
        :raises ValueError: if a database file or its header file is not found
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        # NOTE(review): threads is resolved and logged but not used further in
        # this method — queries run on the existing duckdb connection.
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is computed for logging only here.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config: search both "annotations" and "parquet" database folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param: {database: {field: new_name_or_None}}
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation (replace existing values) / Append (only
        # fill empty values)
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS (total, for the final summary log)
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations (debug trace only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns (dropped again at the end)
        added_columns = []

        # drop indexes (UPDATEs below would otherwise fight the indexes)
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # "ALL" pseudo-database: scan available databases and add each one
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                # "ALL" itself is a directive, not a database
                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists
                if not parquet_file or not parquet_hdr_file:
                    log.error("Annotation failed: file not found")
                    raise ValueError("Annotation failed: file not found")
                else:
                    # Get parquet connexion (ATTACH if database requires it)
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields
                    # if "ALL" in annotation_fields:
                    #     allow_add_extra_column = True
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                # Synthesize a minimal INFO definition for
                                # columns missing from the header file
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping (INFO field name -> database column name)
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Anotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate
                        # force_update_annotation = True
                        # force_append_annotation = True
                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO before re-annotating
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO = REGEXP_REPLACE(
                                        concat(table_variants.INFO,''),
                                        ';*{annotation_fields_new_name}=[^;]*',
                                        ''
                                    )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                    """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header (fall back to defaults
                            # for any missing header attribute)
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append: only annotate when the existing value is
                            # empty ('' or '.')
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                        ELSE ''
                                END
                                """
                                )
                            # Found in a specific column
                            else:
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
                                        ELSE ''
                                END
                                """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # allow_annotation_full_info = True
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        # Fast path: copy the database INFO column as a whole
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                            """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        # Field-removal queries run first (same dict object)
                        query_dict = query_dict_remove

                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database: join on POS
                            # falling within [START+1, END] (REF length taken
                            # into account), aggregating overlapping regions
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from.\"#CHROM\" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                            )
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                    )
                                    as table_parquet
                                    """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    """

                            # Annotation with variants database: exact match
                            # on CHROM/POS/REF/ALT
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                    """
                                sql_query_annotation_where_clause = f"""
                                    table_variants.\"#CHROM\" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                    """

                            # Create update query: append new annotations to
                            # INFO, inserting ';' only when both sides are
                            # non-empty
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                SET INFO = 
                                    concat(
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                            THEN table_variants.INFO
                                            ELSE ''
                                        END
                                        ,
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                                AND (
                                                    concat({sql_query_annotation_update_info_sets_sql})
                                                )
                                                NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        {sql_query_annotation_update_info_sets_sql}
                                    )
                                    {sql_query_annotation_from_clause}
                                    WHERE {sql_query_annotation_where_clause}
                                    ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x (generated queries can
                        # exceed duckdb's default expression depth)
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # duckdb UPDATE returns the affected-row count in
                            # a "Count" column
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

        log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
It takes a VCF file and annotates it with a Parquet annotation file.
Parameters
- threads: number of threads to use for the annotation
Returns
the value of the variable "result".
6038 def annotation_splice(self, threads: int = None) -> None: 6039 """ 6040 This function annotate with snpEff 6041 6042 :param threads: The number of threads to use 6043 :return: the value of the variable "return_value". 6044 """ 6045 6046 # DEBUG 6047 log.debug("Start annotation with splice tools") 6048 6049 # Threads 6050 if not threads: 6051 threads = self.get_threads() 6052 log.debug("Threads: " + str(threads)) 6053 6054 # DEBUG 6055 delete_tmp = True 6056 if self.get_config().get("verbosity", "warning") in ["debug"]: 6057 delete_tmp = False 6058 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 6059 6060 # Config 6061 config = self.get_config() 6062 log.debug("Config: " + str(config)) 6063 splice_config = config.get("tools", {}).get("splice", {}) 6064 if not splice_config: 6065 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 6066 if not splice_config: 6067 msg_err = "No Splice tool config" 6068 log.error(msg_err) 6069 raise ValueError(msg_err) 6070 log.debug(f"splice_config={splice_config}") 6071 6072 # Config - Folders - Databases 6073 databases_folders = ( 6074 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 6075 ) 6076 log.debug("Databases annotations: " + str(databases_folders)) 6077 6078 # Splice docker image 6079 splice_docker_image = splice_config.get("docker").get("image") 6080 6081 # Pull splice image if it's not already there 6082 if not check_docker_image_exists(splice_docker_image): 6083 log.warning( 6084 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 6085 ) 6086 try: 6087 command(f"docker pull {splice_config.get('docker').get('image')}") 6088 except subprocess.CalledProcessError: 6089 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 6090 log.error(msg_err) 6091 raise ValueError(msg_err) 6092 return None 6093 6094 # Config - splice databases 6095 splice_databases = ( 6096 config.get("folders", {}) 6097 .get("databases", {}) 6098 
.get("splice", DEFAULT_SPLICE_FOLDER) 6099 ) 6100 splice_databases = full_path(splice_databases) 6101 6102 # Param 6103 param = self.get_param() 6104 log.debug("Param: " + str(param)) 6105 6106 # Param 6107 options = param.get("annotation", {}).get("splice", {}) 6108 log.debug("Options: " + str(options)) 6109 6110 # Data 6111 table_variants = self.get_table_variants() 6112 6113 # Check if not empty 6114 log.debug("Check if not empty") 6115 sql_query_chromosomes = ( 6116 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 6117 ) 6118 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6119 log.info("VCF empty") 6120 return None 6121 6122 # Export in VCF 6123 log.debug("Create initial file to annotate") 6124 6125 # Create output folder 6126 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6127 if not os.path.exists(output_folder): 6128 Path(output_folder).mkdir(parents=True, exist_ok=True) 6129 6130 # Create tmp VCF file 6131 tmp_vcf = NamedTemporaryFile( 6132 prefix=self.get_prefix(), 6133 dir=output_folder, 6134 suffix=".vcf", 6135 delete=False, 6136 ) 6137 tmp_vcf_name = tmp_vcf.name 6138 6139 # VCF header 6140 header = self.get_header() 6141 6142 # Existing annotations 6143 for vcf_annotation in self.get_header().infos: 6144 6145 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6146 log.debug( 6147 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6148 ) 6149 6150 # Memory limit 6151 if config.get("memory", None): 6152 memory_limit = config.get("memory", "8G").upper() 6153 # upper() 6154 else: 6155 memory_limit = "8G" 6156 log.debug(f"memory_limit: {memory_limit}") 6157 6158 # Check number of variants to annotate 6159 where_clause_regex_spliceai = r"SpliceAI_\w+" 6160 where_clause_regex_spip = r"SPiP_\w+" 6161 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6162 
df_list_of_variants_to_annotate = self.get_query_to_df( 6163 query=f""" SELECT * FROM variants {where_clause} """ 6164 ) 6165 if len(df_list_of_variants_to_annotate) == 0: 6166 log.warning( 6167 f"No variants to annotate with splice. Variants probably already annotated with splice" 6168 ) 6169 return None 6170 else: 6171 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6172 6173 # Export VCF file 6174 self.export_variant_vcf( 6175 vcf_file=tmp_vcf_name, 6176 remove_info=True, 6177 add_samples=True, 6178 index=False, 6179 where_clause=where_clause, 6180 ) 6181 6182 # Create docker container and launch splice analysis 6183 if splice_config: 6184 6185 # Splice mount folders 6186 mount_folders = splice_config.get("mount", {}) 6187 6188 # Genome mount 6189 mount_folders[ 6190 config.get("folders", {}) 6191 .get("databases", {}) 6192 .get("genomes", DEFAULT_GENOME_FOLDER) 6193 ] = "ro" 6194 6195 # SpliceAI mount 6196 mount_folders[ 6197 config.get("folders", {}) 6198 .get("databases", {}) 6199 .get("spliceai", DEFAULT_SPLICEAI_FOLDER) 6200 ] = "ro" 6201 6202 # Genome mount 6203 mount_folders[ 6204 config.get("folders", {}) 6205 .get("databases", {}) 6206 .get("spip", DEFAULT_SPIP_FOLDER) 6207 ] = "ro" 6208 6209 # Mount folders 6210 mount = [] 6211 6212 # Config mount 6213 mount = [ 6214 f"-v {full_path(path)}:{full_path(path)}:{mode}" 6215 for path, mode in mount_folders.items() 6216 ] 6217 6218 if any(value for value in splice_config.values() if value is None): 6219 log.warning("At least one splice config parameter is empty") 6220 return None 6221 6222 # Params in splice nf 6223 def check_values(dico: dict): 6224 """ 6225 Ensure parameters for NF splice pipeline 6226 """ 6227 for key, val in dico.items(): 6228 if key == "genome": 6229 if any( 6230 assemb in options.get("genome", {}) 6231 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6232 ): 6233 yield f"--{key} hg19" 6234 elif any( 6235 assemb in options.get("genome", {}) 6236 for assemb 
in ["hg38", "GRCh38", "grch38", "GRCH38"] 6237 ): 6238 yield f"--{key} hg38" 6239 elif ( 6240 (isinstance(val, str) and val) 6241 or isinstance(val, int) 6242 or isinstance(val, bool) 6243 ): 6244 yield f"--{key} {val}" 6245 6246 # Genome 6247 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6248 options["genome"] = genome 6249 6250 # NF params 6251 nf_params = [] 6252 6253 # Add options 6254 if options: 6255 nf_params = list(check_values(options)) 6256 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6257 else: 6258 log.debug("No NF params provided") 6259 6260 # Add threads 6261 if "threads" not in options.keys(): 6262 nf_params.append(f"--threads {threads}") 6263 6264 # Genome path 6265 genome_path = find_genome( 6266 config.get("folders", {}) 6267 .get("databases", {}) 6268 .get("genomes", DEFAULT_GENOME_FOLDER), 6269 file=f"{genome}.fa", 6270 ) 6271 # Add genome path 6272 if not genome_path: 6273 raise ValueError( 6274 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6275 ) 6276 else: 6277 log.debug(f"Genome: {genome_path}") 6278 nf_params.append(f"--genome_path {genome_path}") 6279 6280 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6281 """ 6282 Setting up updated databases for SPiP and SpliceAI 6283 """ 6284 6285 try: 6286 6287 # SpliceAI assembly transcriptome 6288 spliceai_assembly = os.path.join( 6289 config.get("folders", {}) 6290 .get("databases", {}) 6291 .get("spliceai", {}), 6292 options.get("genome"), 6293 "transcriptome", 6294 ) 6295 spip_assembly = options.get("genome") 6296 6297 spip = find( 6298 f"transcriptome_{spip_assembly}.RData", 6299 config.get("folders", {}).get("databases", {}).get("spip", {}), 6300 ) 6301 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6302 log.debug(f"SPiP annotations: {spip}") 6303 log.debug(f"SpliceAI annotations: {spliceai}") 6304 if spip and spliceai: 6305 return [ 6306 
f"--spip_transcriptome {spip}", 6307 f"--spliceai_annotations {spliceai}", 6308 ] 6309 else: 6310 # TODO crash and go on with basic annotations ? 6311 # raise ValueError( 6312 # "Can't find splice databases in configuration EXIT" 6313 # ) 6314 log.warning( 6315 "Can't find splice databases in configuration, use annotations file from image" 6316 ) 6317 except TypeError: 6318 log.warning( 6319 "Can't find splice databases in configuration, use annotations file from image" 6320 ) 6321 return [] 6322 6323 # Add options, check if transcriptome option have already beend provided 6324 if ( 6325 "spip_transcriptome" not in nf_params 6326 and "spliceai_transcriptome" not in nf_params 6327 ): 6328 splice_reference = splice_annotations(options, config) 6329 if splice_reference: 6330 nf_params.extend(splice_reference) 6331 6332 nf_params.append(f"--output_folder {output_folder}") 6333 6334 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6335 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6336 log.debug(cmd) 6337 6338 splice_config["docker"]["command"] = cmd 6339 6340 docker_cmd = get_bin_command( 6341 tool="splice", 6342 bin_type="docker", 6343 config=config, 6344 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6345 add_options=f"--name {random_uuid} {' '.join(mount)}", 6346 ) 6347 6348 # Docker debug 6349 # if splice_config.get("rm_container"): 6350 # rm_container = "--rm" 6351 # else: 6352 # rm_container = "" 6353 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6354 6355 log.debug(docker_cmd) 6356 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6357 log.debug(res.stdout) 6358 if 
res.stderr: 6359 log.error(res.stderr) 6360 res.check_returncode() 6361 else: 6362 log.warning(f"Splice tool configuration not found: {config}") 6363 6364 # Update variants 6365 log.info("Annotation - Updating...") 6366 # Test find output vcf 6367 log.debug( 6368 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6369 ) 6370 output_vcf = [] 6371 # Wrong folder to look in 6372 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6373 if ( 6374 files 6375 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6376 ): 6377 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6378 # log.debug(os.listdir(options.get("output_folder"))) 6379 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6380 if not output_vcf: 6381 log.debug( 6382 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6383 ) 6384 else: 6385 # Get new header from annotated vcf 6386 log.debug(f"Initial header: {len(header.infos)} fields") 6387 # Create new header with splice infos 6388 new_vcf = Variants(input=output_vcf[0]) 6389 new_vcf_header = new_vcf.get_header().infos 6390 for keys, infos in new_vcf_header.items(): 6391 if keys not in header.infos.keys(): 6392 header.infos[keys] = infos 6393 log.debug(f"New header: {len(header.infos)} fields") 6394 log.debug(f"Splice tmp output: {output_vcf[0]}") 6395 self.update_from_vcf(output_vcf[0]) 6396 6397 # Remove folder 6398 remove_if_exists(output_folder)
This function annotates variants with splice prediction tools (SPiP and SpliceAI).
Parameters
- threads: The number of threads to use
Returns
None (the variants table is updated in place).
    def get_config_default(self, name: str) -> dict:
        """
        The function `get_config_default` returns the built-in default configuration
        for a given category of operations.

        :param name: Name of the configuration section to retrieve; known sections
            are "calculations" and "prioritizations"
        :type name: str
        :return: The default configuration dictionary for `name`, or `None` if
            `name` does not match any known section (see the final
            `config_default.get(name, None)`)
        """

        # Built-in defaults, keyed by section name.
        # "calculations" entries are either type "sql" (applied as a SQL
        # operation_query) or type "python" (dispatched to function_name with
        # function_params); "prioritizations" holds scoring profiles.
        config_default = {
            "calculations": {
                "variant_chr_pos_alt_ref": {
                    "type": "sql",
                    "name": "variant_chr_pos_alt_ref",
                    "description": "Create a variant ID with chromosome, position, alt and ref",
                    "available": False,
                    "output_column_name": "variant_chr_pos_alt_ref",
                    "output_column_type": "String",
                    "output_column_description": "variant ID with chromosome, position, alt and ref",
                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
                    "operation_info": True,
                },
                "VARTYPE": {
                    "type": "sql",
                    "name": "VARTYPE",
                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
                    "available": True,
                    "output_column_name": "VARTYPE",
                    "output_column_type": "String",
                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
                    "operation_query": """
                        CASE
                            WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
                            WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
                            WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
                            WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
                            WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
                            ELSE 'UNDEFINED'
                        END
                    """,
                    "info_fields": ["SVTYPE"],
                    "operation_info": True,
                },
                "snpeff_hgvs": {
                    "type": "python",
                    "name": "snpeff_hgvs",
                    "description": "HGVS nomenclatures from snpEff annotation",
                    "available": True,
                    "function_name": "calculation_extract_snpeff_hgvs",
                    "function_params": ["snpeff_hgvs", "ANN"],
                },
                # NOTE(review): the descriptions of "snpeff_ann_explode" and
                # "snpeff_ann_explode_uniquify" appear swapped relative to their
                # function_params (uniquify flag) — verify before relying on them
                "snpeff_ann_explode": {
                    "type": "python",
                    "name": "snpeff_ann_explode",
                    "description": "Explode snpEff annotations with uniquify values",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "fields", "snpeff_", "ANN"],
                },
                "snpeff_ann_explode_uniquify": {
                    "type": "python",
                    "name": "snpeff_ann_explode_uniquify",
                    "description": "Explode snpEff annotations",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
                },
                "snpeff_ann_explode_json": {
                    "type": "python",
                    "name": "snpeff_ann_explode_json",
                    "description": "Explode snpEff annotations in JSON format",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
                },
                "NOMEN": {
                    "type": "python",
                    "name": "NOMEN",
                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
                    "available": True,
                    "function_name": "calculation_extract_nomen",
                    "function_params": [],
                },
                "FINDBYPIPELINE": {
                    "type": "python",
                    "name": "FINDBYPIPELINE",
                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbypipeline"],
                },
                "FINDBYSAMPLE": {
                    "type": "python",
                    "name": "FINDBYSAMPLE",
                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbysample"],
                },
                "GENOTYPECONCORDANCE": {
                    "type": "python",
                    "name": "GENOTYPECONCORDANCE",
                    "description": "Concordance of genotype for multi caller VCF",
                    "available": True,
                    "function_name": "calculation_genotype_concordance",
                    "function_params": [],
                },
                "BARCODE": {
                    "type": "python",
                    "name": "BARCODE",
                    "description": "BARCODE as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode",
                    "function_params": [],
                },
                "BARCODEFAMILY": {
                    "type": "python",
                    "name": "BARCODEFAMILY",
                    "description": "BARCODEFAMILY as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode_family",
                    "function_params": ["BCF"],
                },
                "TRIO": {
                    "type": "python",
                    "name": "TRIO",
                    "description": "Inheritance for a trio family",
                    "available": True,
                    "function_name": "calculation_trio",
                    "function_params": [],
                },
                "VAF": {
                    "type": "python",
                    "name": "VAF",
                    "description": "Variant Allele Frequency (VAF) harmonization",
                    "available": True,
                    "function_name": "calculation_vaf_normalization",
                    "function_params": [],
                },
                "VAF_stats": {
                    "type": "python",
                    "name": "VAF_stats",
                    "description": "Variant Allele Frequency (VAF) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["VAF"],
                },
                "DP_stats": {
                    "type": "python",
                    "name": "DP_stats",
                    "description": "Depth (DP) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["DP"],
                },
                "variant_id": {
                    "type": "python",
                    "name": "variant_id",
                    "description": "Variant ID generated from variant position and type",
                    "available": True,
                    "function_name": "calculation_variant_id",
                    "function_params": [],
                },
                "transcripts_json": {
                    "type": "python",
                    "name": "transcripts_json",
                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": ["transcripts_json", None],
                },
                "transcripts_ann": {
                    "type": "python",
                    "name": "transcripts_ann",
                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, "transcripts_ann"],
                },
                "transcripts_annotations": {
                    "type": "python",
                    "name": "transcripts_annotations",
                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, None],
                },
                "transcripts_prioritization": {
                    "type": "python",
                    "name": "transcripts_prioritization",
                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
                    "available": True,
                    "function_name": "calculation_transcripts_prioritization",
                    "function_params": [],
                },
            },
            # Default prioritization profile, scored on snpEff impact (ANN2 field)
            "prioritizations": {
                "default": {
                    "ANN2": [
                        {
                            "type": "contains",
                            "value": "HIGH",
                            "score": 5,
                            "flag": "PASS",
                            "comment": [
                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODERATE",
                            "score": 3,
                            "flag": "PASS",
                            "comment": [
                                "A non-disruptive variant that might change protein effectiveness"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "LOW",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Assumed to be mostly harmless or unlikely to change protein behavior"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODIFIER",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
                            ],
                        },
                    ],
                }
            },
        }

        return config_default.get(name, None)
The function get_config_default returns a dictionary containing default configurations for
various calculations and prioritizations.
Parameters
- name: The
The `get_config_default` function returns a dictionary containing default configurations for different calculations and prioritizations. The `name` parameter is used to specify which specific configuration to retrieve from the dictionary.
Returns
The function
The `get_config_default` function returns a dictionary containing default configuration settings for different calculations and prioritizations. The specific configuration settings are retrieved based on the input `name` parameter provided to the function. If the `name` parameter matches a key in the `config_default` dictionary, the corresponding configuration settings are returned. If there is no match, `None` is returned.
6653 def get_config_json( 6654 self, name: str, config_dict: dict = {}, config_file: str = None 6655 ) -> dict: 6656 """ 6657 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 6658 default values, a dictionary, and a file. 6659 6660 :param name: The `name` parameter in the `get_config_json` function is a string that represents 6661 the name of the configuration. It is used to identify and retrieve the configuration settings 6662 for a specific component or module 6663 :type name: str 6664 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 6665 dictionary that allows you to provide additional configuration settings or overrides. When you 6666 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 6667 the key is the configuration setting you want to override or 6668 :type config_dict: dict 6669 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 6670 specify the path to a configuration file that contains additional settings. If provided, the 6671 function will read the contents of this file and update the configuration dictionary with the 6672 values found in the file, overriding any existing values with the 6673 :type config_file: str 6674 :return: The function `get_config_json` returns a dictionary containing the configuration 6675 settings. 
6676 """ 6677 6678 # Create with default prioritizations 6679 config_default = self.get_config_default(name=name) 6680 configuration = config_default 6681 # log.debug(f"configuration={configuration}") 6682 6683 # Replace prioritizations from dict 6684 for config in config_dict: 6685 configuration[config] = config_dict[config] 6686 6687 # Replace prioritizations from file 6688 config_file = full_path(config_file) 6689 if config_file: 6690 if os.path.exists(config_file): 6691 with open(config_file) as config_file_content: 6692 config_file_dict = json.load(config_file_content) 6693 for config in config_file_dict: 6694 configuration[config] = config_file_dict[config] 6695 else: 6696 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 6697 log.error(msg_error) 6698 raise ValueError(msg_error) 6699 6700 return configuration
The function get_config_json retrieves a configuration JSON object with prioritizations from
default values, a dictionary, and a file.
Parameters
- name: The
`name` parameter in the `get_config_json` function is a string that represents the name of the configuration. It is used to identify and retrieve the configuration settings for a specific component or module
- config_dict: The `config_dict` parameter in the `get_config_json` function is a dictionary that allows you to provide additional configuration settings or overrides. When you call the `get_config_json` function, you can pass a dictionary containing key-value pairs where each key is a configuration setting you want to override or add
- config_file: The `config_file` parameter in the `get_config_json` function is used to specify the path to a configuration file that contains additional settings. If provided, the function will read the contents of this file and update the configuration dictionary with the values found in the file, overriding any existing values with those from the file
Returns
The function
The `get_config_json` function returns a dictionary containing the merged configuration settings.
6702 def prioritization( 6703 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 6704 ) -> bool: 6705 """ 6706 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 6707 prioritizes variants based on configured profiles and criteria. 6708 6709 :param table: The `table` parameter in the `prioritization` function is used to specify the name 6710 of the table (presumably a VCF file) on which the prioritization operation will be performed. If 6711 a table name is provided, the method will prioritize the variants in that specific table 6712 :type table: str 6713 :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to 6714 certain INFO fields in a VCF file during the prioritization process. If this parameter is not 6715 provided, the code will use a default prefix value of "PZ" 6716 :type pz_prefix: str 6717 :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass 6718 additional parameters specific to the prioritization process. These parameters can include 6719 settings related to prioritization profiles, fields, scoring modes, flags, comments, and other 6720 configurations needed for the prioritization of variants in a V 6721 :type pz_param: dict 6722 :return: A boolean value (True) is being returned from the `prioritization` function. 
6723 """ 6724 6725 # Config 6726 config = self.get_config() 6727 6728 # Param 6729 param = self.get_param() 6730 6731 # Prioritization param 6732 if pz_param is not None: 6733 prioritization_param = pz_param 6734 else: 6735 prioritization_param = param.get("prioritization", {}) 6736 6737 # Configuration profiles 6738 prioritization_config_file = prioritization_param.get( 6739 "prioritization_config", None 6740 ) 6741 prioritization_config_file = full_path(prioritization_config_file) 6742 prioritizations_config = self.get_config_json( 6743 name="prioritizations", config_file=prioritization_config_file 6744 ) 6745 6746 # Prioritization prefix 6747 pz_prefix_default = "PZ" 6748 if pz_prefix is None: 6749 pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default) 6750 6751 # Prioritization options 6752 profiles = prioritization_param.get("profiles", []) 6753 if isinstance(profiles, str): 6754 profiles = profiles.split(",") 6755 pzfields = prioritization_param.get( 6756 "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"] 6757 ) 6758 if isinstance(pzfields, str): 6759 pzfields = pzfields.split(",") 6760 default_profile = prioritization_param.get("default_profile", None) 6761 pzfields_sep = prioritization_param.get("pzfields_sep", "_") 6762 prioritization_score_mode = prioritization_param.get( 6763 "prioritization_score_mode", "HOWARD" 6764 ) 6765 6766 # Quick Prioritizations 6767 prioritizations = param.get("prioritizations", None) 6768 if prioritizations: 6769 log.info("Quick Prioritization:") 6770 for profile in prioritizations.split(","): 6771 if profile not in profiles: 6772 profiles.append(profile) 6773 log.info(f" {profile}") 6774 6775 # If profile "ALL" provided, all profiles in the config profiles 6776 if "ALL" in profiles: 6777 profiles = list(prioritizations_config.keys()) 6778 6779 for profile in profiles: 6780 if prioritizations_config.get(profile, None): 6781 log.debug(f"Profile '{profile}' configured") 6782 else: 6783 msg_error = f"Profile 
'{profile}' NOT configured" 6784 log.error(msg_error) 6785 raise ValueError(msg_error) 6786 6787 if profiles: 6788 log.info(f"Prioritization... ") 6789 else: 6790 log.debug(f"No profile defined") 6791 return False 6792 6793 if not default_profile and len(profiles): 6794 default_profile = profiles[0] 6795 6796 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 6797 log.debug("Profiles to check: " + str(list(profiles))) 6798 6799 # Variables 6800 if table is not None: 6801 table_variants = table 6802 else: 6803 table_variants = self.get_table_variants(clause="update") 6804 log.debug(f"Table to prioritize: {table_variants}") 6805 6806 # Added columns 6807 added_columns = [] 6808 6809 # Create list of PZfields 6810 # List of PZFields 6811 list_of_pzfields_original = pzfields + [ 6812 pzfield + pzfields_sep + profile 6813 for pzfield in pzfields 6814 for profile in profiles 6815 ] 6816 list_of_pzfields = [] 6817 log.debug(f"{list_of_pzfields_original}") 6818 6819 # Remove existing PZfields to use if exists 6820 for pzfield in list_of_pzfields_original: 6821 if self.get_header().infos.get(pzfield, None) is None: 6822 list_of_pzfields.append(pzfield) 6823 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 6824 else: 6825 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 6826 6827 if list_of_pzfields: 6828 6829 # Explode Infos prefix 6830 explode_infos_prefix = self.get_explode_infos_prefix() 6831 6832 # PZfields tags description 6833 PZfields_INFOS = { 6834 f"{pz_prefix}Tags": { 6835 "ID": f"{pz_prefix}Tags", 6836 "Number": ".", 6837 "Type": "String", 6838 "Description": "Variant tags based on annotation criteria", 6839 }, 6840 f"{pz_prefix}Score": { 6841 "ID": f"{pz_prefix}Score", 6842 "Number": 1, 6843 "Type": "Integer", 6844 "Description": "Variant score based on annotation criteria", 6845 }, 6846 f"{pz_prefix}Flag": { 6847 "ID": f"{pz_prefix}Flag", 6848 "Number": 1, 6849 "Type": "String", 6850 
"Description": "Variant flag based on annotation criteria", 6851 }, 6852 f"{pz_prefix}Comment": { 6853 "ID": f"{pz_prefix}Comment", 6854 "Number": ".", 6855 "Type": "String", 6856 "Description": "Variant comment based on annotation criteria", 6857 }, 6858 f"{pz_prefix}Infos": { 6859 "ID": f"{pz_prefix}Infos", 6860 "Number": ".", 6861 "Type": "String", 6862 "Description": "Variant infos based on annotation criteria", 6863 }, 6864 f"{pz_prefix}Class": { 6865 "ID": f"{pz_prefix}Class", 6866 "Number": ".", 6867 "Type": "String", 6868 "Description": "Variant class based on annotation criteria", 6869 }, 6870 } 6871 6872 # Create INFO fields if not exist 6873 for field in PZfields_INFOS: 6874 field_ID = PZfields_INFOS[field]["ID"] 6875 field_description = PZfields_INFOS[field]["Description"] 6876 if field_ID not in self.get_header().infos and field_ID in pzfields: 6877 field_description = ( 6878 PZfields_INFOS[field]["Description"] 6879 + f", profile {default_profile}" 6880 ) 6881 self.get_header().infos[field_ID] = vcf.parser._Info( 6882 field_ID, 6883 PZfields_INFOS[field]["Number"], 6884 PZfields_INFOS[field]["Type"], 6885 field_description, 6886 "unknown", 6887 "unknown", 6888 code_type_map[PZfields_INFOS[field]["Type"]], 6889 ) 6890 6891 # Create INFO fields if not exist for each profile 6892 for profile in prioritizations_config: 6893 if profile in profiles or profiles == []: 6894 for field in PZfields_INFOS: 6895 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 6896 field_description = ( 6897 PZfields_INFOS[field]["Description"] 6898 + f", profile {profile}" 6899 ) 6900 if ( 6901 field_ID not in self.get_header().infos 6902 and field in pzfields 6903 ): 6904 self.get_header().infos[field_ID] = vcf.parser._Info( 6905 field_ID, 6906 PZfields_INFOS[field]["Number"], 6907 PZfields_INFOS[field]["Type"], 6908 field_description, 6909 "unknown", 6910 "unknown", 6911 code_type_map[PZfields_INFOS[field]["Type"]], 6912 ) 6913 6914 # Header 6915 for pzfield in 
list_of_pzfields: 6916 if re.match(f"{pz_prefix}Score.*", pzfield): 6917 added_column = self.add_column( 6918 table_name=table_variants, 6919 column_name=pzfield, 6920 column_type="INTEGER", 6921 default_value="0", 6922 ) 6923 elif re.match(f"{pz_prefix}Flag.*", pzfield): 6924 added_column = self.add_column( 6925 table_name=table_variants, 6926 column_name=pzfield, 6927 column_type="BOOLEAN", 6928 default_value="1", 6929 ) 6930 elif re.match(f"{pz_prefix}Class.*", pzfield): 6931 added_column = self.add_column( 6932 table_name=table_variants, 6933 column_name=pzfield, 6934 column_type="VARCHAR[]", 6935 default_value="null", 6936 ) 6937 else: 6938 added_column = self.add_column( 6939 table_name=table_variants, 6940 column_name=pzfield, 6941 column_type="STRING", 6942 default_value="''", 6943 ) 6944 added_columns.append(added_column) 6945 6946 # Profiles 6947 if profiles: 6948 6949 # foreach profile in configuration file 6950 for profile in prioritizations_config: 6951 6952 # If profile is asked in param, or ALL are asked (empty profile []) 6953 if profile in profiles or profiles == []: 6954 log.info(f"Profile '{profile}'") 6955 6956 sql_set_info_option = "" 6957 6958 sql_set_info = [] 6959 6960 # PZ fields set 6961 6962 # PZScore 6963 if ( 6964 f"{pz_prefix}Score{pzfields_sep}{profile}" 6965 in list_of_pzfields 6966 ): 6967 sql_set_info.append( 6968 f""" 6969 concat( 6970 '{pz_prefix}Score{pzfields_sep}{profile}=', 6971 {pz_prefix}Score{pzfields_sep}{profile} 6972 ) 6973 """ 6974 ) 6975 if ( 6976 profile == default_profile 6977 and f"{pz_prefix}Score" in list_of_pzfields 6978 ): 6979 sql_set_info.append( 6980 f""" 6981 concat( 6982 '{pz_prefix}Score=', 6983 {pz_prefix}Score{pzfields_sep}{profile} 6984 ) 6985 """ 6986 ) 6987 6988 # PZFlag 6989 if ( 6990 f"{pz_prefix}Flag{pzfields_sep}{profile}" 6991 in list_of_pzfields 6992 ): 6993 sql_set_info.append( 6994 f""" 6995 concat( 6996 '{pz_prefix}Flag{pzfields_sep}{profile}=', 6997 CASE 6998 WHEN 
{pz_prefix}Flag{pzfields_sep}{profile}==1 6999 THEN 'PASS' 7000 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7001 THEN 'FILTERED' 7002 END 7003 ) 7004 """ 7005 ) 7006 if ( 7007 profile == default_profile 7008 and f"{pz_prefix}Flag" in list_of_pzfields 7009 ): 7010 sql_set_info.append( 7011 f""" 7012 concat( 7013 '{pz_prefix}Flag=', 7014 CASE 7015 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7016 THEN 'PASS' 7017 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7018 THEN 'FILTERED' 7019 END 7020 ) 7021 """ 7022 ) 7023 7024 # PZClass 7025 if ( 7026 f"{pz_prefix}Class{pzfields_sep}{profile}" 7027 in list_of_pzfields 7028 ): 7029 sql_set_info.append( 7030 f""" 7031 concat( 7032 '{pz_prefix}Class{pzfields_sep}{profile}=', 7033 CASE 7034 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7035 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7036 ELSE '.' 7037 END 7038 ) 7039 7040 """ 7041 ) 7042 if ( 7043 profile == default_profile 7044 and f"{pz_prefix}Class" in list_of_pzfields 7045 ): 7046 sql_set_info.append( 7047 f""" 7048 concat( 7049 '{pz_prefix}Class=', 7050 CASE 7051 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7052 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7053 ELSE '.' 
7054 END 7055 ) 7056 """ 7057 ) 7058 7059 # PZComment 7060 if ( 7061 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7062 in list_of_pzfields 7063 ): 7064 sql_set_info.append( 7065 f""" 7066 CASE 7067 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7068 THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile}) 7069 ELSE '' 7070 END 7071 """ 7072 ) 7073 if ( 7074 profile == default_profile 7075 and f"{pz_prefix}Comment" in list_of_pzfields 7076 ): 7077 sql_set_info.append( 7078 f""" 7079 CASE 7080 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7081 THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile}) 7082 ELSE '' 7083 END 7084 """ 7085 ) 7086 7087 # PZInfos 7088 if ( 7089 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7090 in list_of_pzfields 7091 ): 7092 sql_set_info.append( 7093 f""" 7094 CASE 7095 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7096 THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile}) 7097 ELSE '' 7098 END 7099 """ 7100 ) 7101 if ( 7102 profile == default_profile 7103 and f"{pz_prefix}Infos" in list_of_pzfields 7104 ): 7105 sql_set_info.append( 7106 f""" 7107 CASE 7108 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7109 THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile}) 7110 ELSE '' 7111 END 7112 """ 7113 ) 7114 7115 # Merge PZfields 7116 sql_set_info_option = "" 7117 sql_set_sep = "" 7118 for sql_set in sql_set_info: 7119 if sql_set_sep: 7120 sql_set_info_option += f""" 7121 , concat('{sql_set_sep}', {sql_set}) 7122 """ 7123 else: 7124 sql_set_info_option += f""" 7125 , {sql_set} 7126 """ 7127 sql_set_sep = ";" 7128 7129 sql_queries = [] 7130 for annotation in prioritizations_config[profile]: 7131 7132 # skip special sections 7133 if annotation.startswith("_"): 7134 continue 7135 7136 # For each criterions 7137 for criterion in prioritizations_config[profile][ 7138 annotation 
7139 ]: 7140 7141 # Criterion mode 7142 criterion_mode = None 7143 if np.any( 7144 np.isin(list(criterion.keys()), ["type", "value"]) 7145 ): 7146 criterion_mode = "operation" 7147 elif np.any( 7148 np.isin(list(criterion.keys()), ["sql", "fields"]) 7149 ): 7150 criterion_mode = "sql" 7151 log.debug(f"Criterion Mode: {criterion_mode}") 7152 7153 # Criterion parameters 7154 criterion_type = criterion.get("type", None) 7155 criterion_value = criterion.get("value", None) 7156 criterion_sql = criterion.get("sql", None) 7157 criterion_fields = criterion.get("fields", None) 7158 criterion_score = criterion.get("score", 0) 7159 criterion_flag = criterion.get("flag", "PASS") 7160 criterion_class = criterion.get("class", None) 7161 criterion_flag_bool = criterion_flag == "PASS" 7162 criterion_comment = ( 7163 ", ".join(criterion.get("comment", [])) 7164 .replace("'", "''") 7165 .replace(";", ",") 7166 .replace("\t", " ") 7167 ) 7168 criterion_infos = ( 7169 str(criterion) 7170 .replace("'", "''") 7171 .replace(";", ",") 7172 .replace("\t", " ") 7173 ) 7174 7175 # SQL 7176 if criterion_sql is not None and isinstance( 7177 criterion_sql, list 7178 ): 7179 criterion_sql = " ".join(criterion_sql) 7180 7181 # Fields and explode 7182 if criterion_fields is None: 7183 criterion_fields = [annotation] 7184 if not isinstance(criterion_fields, list): 7185 criterion_fields = str(criterion_fields).split(",") 7186 7187 # Class 7188 if criterion_class is not None and not isinstance( 7189 criterion_class, list 7190 ): 7191 criterion_class = str(criterion_class).split(",") 7192 7193 for annotation_field in criterion_fields: 7194 7195 # Explode specific annotation 7196 log.debug( 7197 f"Explode annotation '{annotation_field}'" 7198 ) 7199 added_columns += self.explode_infos( 7200 prefix=explode_infos_prefix, 7201 fields=[annotation_field], 7202 table=table_variants, 7203 ) 7204 extra_infos = self.get_extra_infos( 7205 table=table_variants 7206 ) 7207 7208 # Check if annotation field is 
present 7209 if ( 7210 f"{explode_infos_prefix}{annotation_field}" 7211 not in extra_infos 7212 ): 7213 msq_err = f"Annotation '{annotation_field}' not in data" 7214 log.error(msq_err) 7215 raise ValueError(msq_err) 7216 else: 7217 log.debug( 7218 f"Annotation '{annotation_field}' in data" 7219 ) 7220 7221 sql_set = [] 7222 sql_set_info = [] 7223 7224 # PZ fields set 7225 7226 # PZScore 7227 if ( 7228 f"{pz_prefix}Score{pzfields_sep}{profile}" 7229 in list_of_pzfields 7230 ): 7231 # if prioritization_score_mode == "HOWARD": 7232 # sql_set.append( 7233 # f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7234 # ) 7235 # VaRank prioritization score mode 7236 if prioritization_score_mode == "VaRank": 7237 sql_set.append( 7238 f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END" 7239 ) 7240 # default HOWARD prioritization score mode 7241 else: 7242 sql_set.append( 7243 f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7244 ) 7245 7246 # PZFlag 7247 if ( 7248 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7249 in list_of_pzfields 7250 ): 7251 sql_set.append( 7252 f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}" 7253 ) 7254 7255 # PZClass 7256 if ( 7257 f"{pz_prefix}Class{pzfields_sep}{profile}" 7258 in list_of_pzfields 7259 and criterion_class is not None 7260 ): 7261 sql_set.append( 7262 f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) " 7263 ) 7264 7265 # PZComment 7266 if ( 7267 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7268 in list_of_pzfields 7269 ): 7270 sql_set.append( 7271 f""" 7272 {pz_prefix}Comment{pzfields_sep}{profile} = 7273 concat( 7274 {pz_prefix}Comment{pzfields_sep}{profile}, 7275 CASE 7276 WHEN 
{pz_prefix}Comment{pzfields_sep}{profile}!='' 7277 THEN ', ' 7278 ELSE '' 7279 END, 7280 '{criterion_comment}' 7281 ) 7282 """ 7283 ) 7284 7285 # PZInfos 7286 if ( 7287 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7288 in list_of_pzfields 7289 ): 7290 sql_set.append( 7291 f""" 7292 {pz_prefix}Infos{pzfields_sep}{profile} = 7293 concat( 7294 {pz_prefix}Infos{pzfields_sep}{profile}, 7295 '{criterion_infos}' 7296 ) 7297 """ 7298 ) 7299 sql_set_option = ",".join(sql_set) 7300 7301 # Criterion and comparison 7302 if sql_set_option: 7303 7304 if criterion_mode in ["operation"]: 7305 7306 try: 7307 float(criterion_value) 7308 sql_update = f""" 7309 UPDATE {table_variants} 7310 SET {sql_set_option} 7311 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 7312 AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value} 7313 """ 7314 except: 7315 contains_option = "" 7316 if criterion_type == "contains": 7317 contains_option = ".*" 7318 sql_update = f""" 7319 UPDATE {table_variants} 7320 SET {sql_set_option} 7321 WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 7322 """ 7323 sql_queries.append(sql_update) 7324 7325 elif criterion_mode in ["sql"]: 7326 7327 sql_update = f""" 7328 UPDATE {table_variants} 7329 SET {sql_set_option} 7330 WHERE {criterion_sql} 7331 """ 7332 sql_queries.append(sql_update) 7333 7334 else: 7335 msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')" 7336 log.error(msg_err) 7337 raise ValueError(msg_err) 7338 7339 else: 7340 log.warning( 7341 f"NO SQL SET option for '{annotation}' - '{criterion}'" 7342 ) 7343 7344 # PZTags 7345 if ( 7346 f"{pz_prefix}Tags{pzfields_sep}{profile}" 7347 in list_of_pzfields 7348 ): 7349 7350 # Create PZFalgs value 7351 pztags_value = "" 7352 pztags_sep_default = "," 7353 pztags_sep = "" 7354 for pzfield in pzfields: 7355 if pzfield not in [f"{pz_prefix}Tags"]: 7356 if ( 7357 
f"{pzfield}{pzfields_sep}{profile}" 7358 in list_of_pzfields 7359 ): 7360 if pzfield in [f"{pz_prefix}Flag"]: 7361 pztags_value += f"""{pztags_sep}{pzfield}#', 7362 CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile} 7363 THEN 'PASS' 7364 ELSE 'FILTERED' 7365 END, '""" 7366 elif pzfield in [f"{pz_prefix}Class"]: 7367 pztags_value += f"""{pztags_sep}{pzfield}#', 7368 CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7369 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7370 ELSE '.' 7371 END, '""" 7372 else: 7373 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 7374 pztags_sep = pztags_sep_default 7375 7376 # Add Query update for PZFlags 7377 sql_update_pztags = f""" 7378 UPDATE {table_variants} 7379 SET INFO = concat( 7380 INFO, 7381 CASE WHEN INFO NOT in ('','.') 7382 THEN ';' 7383 ELSE '' 7384 END, 7385 '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}' 7386 ) 7387 """ 7388 sql_queries.append(sql_update_pztags) 7389 7390 # Add Query update for PZFlags for default 7391 if profile == default_profile: 7392 sql_update_pztags_default = f""" 7393 UPDATE {table_variants} 7394 SET INFO = concat( 7395 INFO, 7396 ';', 7397 '{pz_prefix}Tags={pztags_value}' 7398 ) 7399 """ 7400 sql_queries.append(sql_update_pztags_default) 7401 7402 log.info(f"""Profile '{profile}' - Prioritization... """) 7403 7404 if sql_queries: 7405 7406 for sql_query in sql_queries: 7407 log.debug( 7408 f"""Profile '{profile}' - Prioritization query: {sql_query}... """ 7409 ) 7410 self.conn.execute(sql_query) 7411 7412 log.info(f"""Profile '{profile}' - Update... 
""") 7413 sql_query_update = f""" 7414 UPDATE {table_variants} 7415 SET INFO = 7416 concat( 7417 CASE 7418 WHEN INFO NOT IN ('','.') 7419 THEN concat(INFO, ';') 7420 ELSE '' 7421 END 7422 {sql_set_info_option} 7423 ) 7424 """ 7425 self.conn.execute(sql_query_update) 7426 7427 else: 7428 7429 log.warning(f"No profiles in parameters") 7430 7431 # Remove added columns 7432 for added_column in added_columns: 7433 self.drop_column(column=added_column) 7434 7435 # Explode INFOS fields into table fields 7436 if self.get_explode_infos(): 7437 self.explode_infos( 7438 prefix=self.get_explode_infos_prefix(), 7439 fields=self.get_explode_infos_fields(), 7440 force=True, 7441 ) 7442 7443 return True
The prioritization function in Python processes VCF files, adds new INFO fields, and
prioritizes variants based on configured profiles and criteria.
Parameters
- table: The `table` parameter in the `prioritization` function specifies the name of the table (holding the VCF variants) on which the prioritization operation will be performed. If a table name is provided, the method prioritizes the variants in that specific table.
- pz_prefix: The `pz_prefix` parameter specifies a prefix that will be added to certain INFO fields of the VCF file during the prioritization process. If this parameter is not provided, the default prefix "PZ" is used.
- pz_param: The `pz_param` parameter of the `prioritization` method is used to pass additional parameters specific to the prioritization process. These parameters can include settings related to prioritization profiles, fields, scoring modes, flags, comments, and other configurations needed for the prioritization of variants in a VCF file.
Returns
A boolean value (True) is returned from the `prioritization` function.
def annotation_hgvs(self, threads: int = None) -> None:
    """
    The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
    coordinates and alleles.

    It selects SNV/InDel variants from the variants table, finds the RefSeq transcripts
    overlapping each position, formats one HGVS name per transcript (optionally with exon,
    gene, protein and version information), and appends the result to the INFO column as an
    'hgvs=' annotation.

    :param threads: The `threads` parameter is an optional integer that specifies the number of
    threads to use for parallel processing. If no value is provided, it will default to the number
    of threads obtained from the `get_threads()` method
    :type threads: int
    """

    # Function for each partition of the Dask Dataframe
    def partition_function(partition):
        """
        The function `partition_function` applies the `annotation_hgvs_partition` function to
        each row of a DataFrame called `partition`.

        :param partition: The parameter "partition" is a pandas DataFrame that contains the data
        to be processed
        :return: the result of applying the "annotation_hgvs_partition" function to each row of
        the "partition" dataframe along the axis 1.
        """
        return partition.apply(annotation_hgvs_partition, axis=1)

    def annotation_hgvs_partition(row) -> str:
        """
        The function `annotation_hgvs_partition` takes in a row of data and returns a string
        containing a list of HGVS names associated with the given genomic coordinates and alleles.

        :param row: A dictionary-like object providing "CHROM", "POS", "REF" and "ALT" values
        :return: a string that contains the comma-joined HGVS names associated with the given row.
        """

        chr = row["CHROM"]
        pos = row["POS"]
        ref = row["REF"]
        alt = row["ALT"]

        # Find list of associated transcripts
        # (refseq_df is visible to the SQLContext through register_globals)
        transcripts_list = list(
            polars_conn.execute(
                f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
                """
            )["transcript"]
        )

        # Full HGVS annotation in list
        hgvs_full_list = []

        for transcript_name in transcripts_list:

            # Transcript
            transcript = get_transcript(
                transcripts=transcripts, transcript_name=transcript_name
            )
            # Exon
            if use_exon:
                exon = transcript.find_exon_number(pos)
            else:
                exon = None
            # Protein
            transcript_protein = None
            if use_protein or add_protein or full_format:
                # NOTE(review): refseqlink_df is only created below when a refSeqLink
                # file was found; if protein output is requested without one, this
                # query references a missing table — confirm upstream guarantees.
                transcripts_protein = list(
                    polars_conn.execute(
                        f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                        """
                    )["protein"]
                )
                if len(transcripts_protein):
                    transcript_protein = transcripts_protein[0]

            # HGVS name
            hgvs_name = format_hgvs_name(
                chr,
                pos,
                ref,
                alt,
                genome=genome,
                transcript=transcript,
                transcript_protein=transcript_protein,
                exon=exon,
                use_gene=use_gene,
                use_protein=use_protein,
                full_format=full_format,
                use_version=use_version,
                codon_type=codon_type,
            )
            hgvs_full_list.append(hgvs_name)
            # Optionally add a second, protein-level HGVS name for the same transcript
            if add_protein and not use_protein and not full_format:
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=True,
                    full_format=False,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)

        # Create liste of HGVS annotations
        hgvs_full = ",".join(hgvs_full_list)

        return hgvs_full

    # Polars connexion (eager SQL over registered globals, e.g. refseq_df)
    polars_conn = pl.SQLContext(register_globals=True, eager=True)

    # Config
    config = self.get_config()

    # Databases
    # Genome
    databases_genomes_folders = (
        config.get("folders", {})
        .get("databases", {})
        .get("genomes", DEFAULT_GENOME_FOLDER)
    )
    databases_genome = (
        config.get("folders", {}).get("databases", {}).get("genomes", "")
    )
    # refseq database folder
    databases_refseq_folders = (
        config.get("folders", {})
        .get("databases", {})
        .get("refseq", DEFAULT_REFSEQ_FOLDER)
    )
    # refseq
    databases_refseq = config.get("databases", {}).get("refSeq", None)
    # refSeqLink
    databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

    # Param
    param = self.get_param()

    # Quick HGVS: parse comma-separated "hgvs_options" ("var=val" pairs, bare
    # "var" means True) into param["hgvs"], coercing textual booleans
    if "hgvs_options" in param and param.get("hgvs_options", ""):
        log.info(f"Quick HGVS Annotation:")
        if not param.get("hgvs", None):
            param["hgvs"] = {}
        for option in param.get("hgvs_options", "").split(","):
            option_var_val = option.split("=")
            option_var = option_var_val[0]
            if len(option_var_val) > 1:
                option_val = option_var_val[1]
            else:
                option_val = "True"
            if option_val.upper() in ["TRUE"]:
                option_val = True
            elif option_val.upper() in ["FALSE"]:
                option_val = False
            log.info(f" {option_var}={option_val}")
            param["hgvs"][option_var] = option_val

    # Check if HGVS annotation enabled; return early (no-op) otherwise
    if "hgvs" in param:
        log.info(f"HGVS Annotation... ")
        for hgvs_option in param.get("hgvs", {}):
            log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
    else:
        return

    # HGVS Param
    param_hgvs = param.get("hgvs", {})
    use_exon = param_hgvs.get("use_exon", False)
    use_gene = param_hgvs.get("use_gene", False)
    use_protein = param_hgvs.get("use_protein", False)
    add_protein = param_hgvs.get("add_protein", False)
    full_format = param_hgvs.get("full_format", False)
    use_version = param_hgvs.get("use_version", False)
    codon_type = param_hgvs.get("codon_type", "3")

    # refSseq refSeqLink (param overrides config)
    databases_refseq = param_hgvs.get("refseq", databases_refseq)
    databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

    # Assembly
    assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

    # Genome: explicit genome file takes precedence over folder+assembly lookup
    genome_file = None
    if find_genome(databases_genome):
        genome_file = find_genome(databases_genome)
    else:
        genome_file = find_genome(
            genome_path=databases_genomes_folders, assembly=assembly
        )
    log.debug("Genome: " + str(genome_file))

    # refSseq
    refseq_file = find_file_prefix(
        input_file=databases_refseq,
        prefix="ncbiRefSeq",
        folder=databases_refseq_folders,
        assembly=assembly,
    )
    log.debug("refSeq: " + str(refseq_file))

    # refSeqLink
    refseqlink_file = find_file_prefix(
        input_file=databases_refseqlink,
        prefix="ncbiRefSeqLink",
        folder=databases_refseq_folders,
        assembly=assembly,
    )
    log.debug("refSeqLink: " + str(refseqlink_file))

    # Threads
    if not threads:
        threads = self.get_threads()
    log.debug("Threads: " + str(threads))

    # Variables
    table_variants = self.get_table_variants(clause="update")

    # Get variants SNV and InDel only (REF/ALT restricted to plain sequences)
    query_variants = f"""
        SELECT "#CHROM" AS CHROM, POS, REF, ALT
        FROM {table_variants}
        WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
        """
    df_variants = self.get_query_to_df(query_variants)

    # Added columns
    added_columns = []

    # Add hgvs column in variants table (random suffix to avoid name clashes)
    hgvs_column_name = "hgvs_" + str(random.randrange(1000))
    added_column = self.add_column(
        table_variants, hgvs_column_name, "STRING", default_value=None
    )
    added_columns.append(added_column)

    log.debug(f"refSeq loading...")
    # refSeq in duckDB
    refseq_table = get_refseq_table(
        conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
    )
    # Loading all refSeq in Dataframe (transcripts overlapping a variant position)
    refseq_query = f"""
        SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
        FROM {refseq_table}
        JOIN df_variants ON (
            {refseq_table}.chrom = df_variants.CHROM
            AND {refseq_table}.txStart<=df_variants.POS
            AND {refseq_table}.txEnd>=df_variants.POS
        )
        """
    refseq_df = self.conn.query(refseq_query).pl()

    if refseqlink_file:
        log.debug(f"refSeqLink loading...")
        # refSeqLink in duckDB
        refseqlink_table = get_refseq_table(
            conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
        )
        # Loading all refSeqLink in Dataframe (transcript -> protein accession)
        protacc_column = "protAcc_with_ver"
        mrnaacc_column = "mrnaAcc_with_ver"
        refseqlink_query = f"""
            SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
            FROM {refseqlink_table}
            JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
            WHERE protAcc_without_ver IS NOT NULL
            """
        # Polars Dataframe
        refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

    # Read RefSeq transcripts into a python dict/model.
    log.debug(f"Transcripts loading...")
    with tempfile.TemporaryDirectory() as tmpdir:
        transcripts_query = f"""
            COPY (
                SELECT {refseq_table}.*
                FROM {refseq_table}
                JOIN df_variants ON (
                    {refseq_table}.chrom=df_variants.CHROM
                    AND {refseq_table}.txStart<=df_variants.POS
                    AND {refseq_table}.txEnd>=df_variants.POS
                )
            )
            TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
        self.conn.query(transcripts_query)
        with open(f"{tmpdir}/transcript.tsv") as infile:
            transcripts = read_transcripts(infile)

    # Polars connexion
    # NOTE(review): re-created here although already created above — appears redundant
    polars_conn = pl.SQLContext(register_globals=True, eager=True)

    log.debug("Genome loading...")
    # Read genome sequence using pyfaidx.
    genome = Fasta(genome_file)

    log.debug("Start annotation HGVS...")

    # Create
    # a Dask Dataframe from Pandas dataframe with partition as number of threads
    ddf = dd.from_pandas(df_variants, npartitions=threads)

    # Use dask.dataframe.apply() to apply function on each partition
    ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

    # Convert Dask DataFrame to Pandas Dataframe
    df = ddf.compute()

    # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
    with tempfile.TemporaryDirectory() as tmpdir:
        df_parquet = os.path.join(tmpdir, "df.parquet")
        df.to_parquet(df_parquet)

        # Update hgvs column
        # NOTE(review): the "variants." alias assumes the variants table is
        # literally named "variants" — confirm against get_table_variants()
        update_variant_query = f"""
            UPDATE {table_variants}
            SET "{hgvs_column_name}"=df."{hgvs_column_name}"
            FROM read_parquet('{df_parquet}') as df
            WHERE variants."#CHROM" = df.CHROM
            AND variants.POS = df.POS
            AND variants.REF = df.REF
            AND variants.ALT = df.ALT
            AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
            """
        self.execute_query(update_variant_query)

    # Update INFO column (append ';' separator only when INFO is non-empty)
    sql_query_update = f"""
        UPDATE {table_variants}
        SET INFO =
            concat(
                CASE
                    WHEN INFO NOT IN ('','.')
                    THEN concat(INFO, ';')
                    ELSE ''
                END,
                'hgvs=',
                {hgvs_column_name}
            )
        WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
        """
    self.execute_query(sql_query_update)

    # Add header
    HGVS_INFOS = {
        "hgvs": {
            "ID": "hgvs",
            "Number": ".",
            "Type": "String",
            "Description": f"HGVS annotatation with HOWARD",
        }
    }

    for field in HGVS_INFOS:
        field_ID = HGVS_INFOS[field]["ID"]
        field_description = HGVS_INFOS[field]["Description"]
        self.get_header().infos[field_ID] = vcf.parser._Info(
            field_ID,
            HGVS_INFOS[field]["Number"],
            HGVS_INFOS[field]["Type"],
            field_description,
            "unknown",
            "unknown",
            code_type_map[HGVS_INFOS[field]["Type"]],
        )

    # Remove added columns
    for added_column in added_columns:
        self.drop_column(column=added_column)
The annotation_hgvs function performs HGVS annotation on a set of variants using genomic
coordinates and alleles.
Parameters
- threads: The `threads` parameter is an optional integer that specifies the number of threads to use for parallel processing. If no value is provided, it defaults to the number of threads obtained from the `get_threads()` method.
def get_operations_help(
    self, operations_config_dict: dict = {}, operations_config_file: str = None
) -> list:
    """
    Build a help text listing the available calculation operations.

    Loads the "calculations" configuration (from a dict and/or a file via
    `get_config_json`), keeps only operations marked as available, formats one
    help line per operation, sorts them alphabetically, and prepends a header.

    :param operations_config_dict: dict of operations configuration
    :type operations_config_dict: dict
    :param operations_config_file: path to an operations configuration file
    :type operations_config_file: str
    :return: list of help lines, starting with "Available calculation operations:"
    """

    # Init
    operations_help = []

    # operations configuration (dict and/or file)
    operations = self.get_config_json(
        name="calculations",
        config_dict=operations_config_dict,
        config_file=operations_config_file,
    )

    # One help line per available operation: name defaults to the key
    # (uppercased), description defaults to the name
    for op in operations:
        op_name = operations[op].get("name", op).upper()
        op_description = operations[op].get("description", op_name)
        op_available = operations[op].get("available", False)
        if op_available:
            operations_help.append(f" {op_name}: {op_description}")

    # Sort operations alphabetically for stable output
    operations_help.sort()

    # insert header
    operations_help.insert(0, "Available calculation operations:")

    # Return
    return operations_help
def calculation(
    self,
    operations: dict = {},
    operations_config_dict: dict = {},
    operations_config_file: str = None,
) -> None:
    """
    It takes a list of operations, and for each operation, it checks if it's a python or sql
    operation, and then calls the appropriate function.

    Operations are resolved in this order: the "calculation"/"calculations"
    section of param (overrides the `operations` argument), then quick
    comma-separated names from param["calculations"] are merged in, and finally
    the param section is re-read as a fallback when still empty.

    :param operations: dict of operations to apply (operation name -> options)
    :type operations: dict
    :param operations_config_dict: dict of available operations configuration
    :type operations_config_dict: dict
    :param operations_config_file: path to an operations configuration file
    :type operations_config_file: str
    :raises ValueError: if an operation name or its "type" is not available in
        the operations configuration

    param json example:
        "calculation": {
            "NOMEN": {
                "options": {
                    "hgvs_field": "hgvs"
                },
                "middle": null
            }
        }
    """

    # Param
    param = self.get_param()

    # operations config (dict and/or file)
    operations_config = self.get_config_json(
        name="calculations",
        config_dict=operations_config_dict,
        config_file=operations_config_file,
    )

    # Upper keys (operation lookup is case-insensitive)
    operations_config = {k.upper(): v for k, v in operations_config.items()}

    # Calculations

    # Operations from param (param overrides the argument)
    operations = param.get("calculation", {}).get("calculations", operations)

    # Quick calculation - add comma-separated names with empty options,
    # mirroring them into the param tree
    if param.get("calculations", None):
        calculations_list = [
            value for value in param.get("calculations", "").split(",")
        ]
        log.info(f"Quick Calculations:")
        for calculation_key in calculations_list:
            log.info(f" {calculation_key}")
        for calculation_operation in calculations_list:
            if calculation_operation.upper() not in operations:
                operations[calculation_operation.upper()] = {}
                add_value_into_dict(
                    dict_tree=param,
                    sections=[
                        "calculation",
                        "calculations",
                        calculation_operation.upper(),
                    ],
                    value={},
                )

    # Operations for calculation (fallback re-read if still empty)
    if not operations:
        operations = param.get("calculation", {}).get("calculations", {})

    if operations:
        log.info(f"Calculations...")

        # For each operations
        for operation_name in operations:
            operation_name = operation_name.upper()
            if operation_name not in [""]:
                if operation_name in operations_config:
                    log.info(f"Calculation '{operation_name}'")
                    operation = operations_config[operation_name]
                    # Dispatch on operation type; default is "sql"
                    operation_type = operation.get("type", "sql")
                    if operation_type == "python":
                        self.calculation_process_function(
                            operation=operation, operation_name=operation_name
                        )
                    elif operation_type == "sql":
                        self.calculation_process_sql(
                            operation=operation, operation_name=operation_name
                        )
                    else:
                        log.error(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                else:
                    log.error(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )
                    raise ValueError(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )

    # Explode INFOS fields into table fields
    if self.get_explode_infos():
        self.explode_infos(
            prefix=self.get_explode_infos_prefix(),
            fields=self.get_explode_infos_fields(),
            force=True,
        )
It takes a list of operations, and for each operation, it checks if it's a python or sql operation, and then calls the appropriate function
param json example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" }, "middle": null } }
def calculation_process_sql(
    self, operation: dict, operation_name: str = "unknown"
) -> None:
    """
    Perform a SQL-defined calculation and update the variants table with the result.

    The operation dict may contain:
    - "name": operation name (defaults to "unknown"; used for logs and errors)
    - "output_column_name"/"output_column_type"/"output_column_description": output column spec
    - "explode_infos_prefix": prefix for exploded INFO columns
    - "operation_query": SQL expression (str, or list of str joined with spaces) — mandatory
    - "info_fields": INFO fields required by the query
    - "info_fields_check": if True, verify all "info_fields" exist in the VCF header
    - "operation_info": if True (default), append the result to the INFO column

    :param operation: dictionary describing the operation to perform
    :param operation_name: fallback operation name, used for logging and error
        handling, defaults to "unknown"
    :raises ValueError: if the query is missing, a mandatory INFO field is absent,
        or the SQL update fails
    """

    # Table variants (clause="alter" so the table can be altered)
    table_variants = self.get_table_variants(clause="alter")

    # Operation infos
    operation_name = operation.get("name", "unknown")
    log.debug(f"process sql {operation_name}")
    output_column_name = operation.get("output_column_name", operation_name)
    output_column_type = operation.get("output_column_type", "String")
    prefix = operation.get("explode_infos_prefix", "")
    output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
    output_column_description = operation.get(
        "output_column_description", f"{operation_name} operation"
    )
    operation_query = operation.get("operation_query", None)
    if isinstance(operation_query, list):
        operation_query = " ".join(operation_query)
    operation_info_fields = operation.get("info_fields", [])
    operation_info_fields_check = operation.get("info_fields_check", False)
    operation_info = operation.get("operation_info", True)

    if operation_query:

        # Info fields check: all declared info fields must exist in the VCF header
        operation_info_fields_check_result = True
        if operation_info_fields_check:
            header_infos = self.get_header().infos
            operation_info_fields_check_result = all(
                info_field in header_infos
                for info_field in operation_info_fields
            )

        # If info fields available
        if operation_info_fields_check_result:

            # Columns added temporarily for the calculation (dropped at the end)
            added_columns = []

            # Create VCF header field
            vcf_reader = self.get_header()
            vcf_reader.infos[output_column_name] = vcf.parser._Info(
                output_column_name,
                ".",
                output_column_type,
                output_column_description,
                "howard calculation",
                "0",
                self.code_type_map.get(output_column_type),
            )

            # Explode infos if needed
            log.debug(f"calculation_process_sql prefix {prefix}")
            added_columns += self.explode_infos(
                prefix=prefix,
                fields=[output_column_name] + operation_info_fields,
                force=True,
            )

            # Create the output column
            added_column = self.add_column(
                table_name=table_variants,
                column_name=prefix + output_column_name,
                column_type=output_column_type_sql,
                default_value="null",
            )
            added_columns.append(added_column)

            # Operation calculation
            try:

                # Query to update calculation column
                sql_update = f"""
                    UPDATE {table_variants}
                    SET "{prefix}{output_column_name}" = ({operation_query})
                """
                self.conn.execute(sql_update)

                # Add to INFO
                if operation_info:
                    sql_update_info = f"""
                        UPDATE {table_variants}
                        SET "INFO" =
                            concat(
                                CASE
                                    WHEN "INFO" IS NOT NULL
                                    THEN concat("INFO", ';')
                                    ELSE ''
                                END,
                                '{output_column_name}=',
                                "{prefix}{output_column_name}"
                            )
                        WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
                    """
                    self.conn.execute(sql_update_info)

            # Was a bare `except:` — narrowed to Exception and cause chained
            except Exception as e:
                log.error(
                    f"Operations config: Calculation '{operation_name}' query failed"
                )
                raise ValueError(
                    f"Operations config: Calculation '{operation_name}' query failed"
                ) from e

            # Remove added columns
            for added_column in added_columns:
                log.debug(f"added_column: {added_column}")
                self.drop_column(column=added_column)

        else:
            log.error(
                f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
            )
            raise ValueError(
                f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
            )

    else:
        log.error(
            f"Operations config: Calculation '{operation_name}' query NOT defined"
        )
        raise ValueError(
            f"Operations config: Calculation '{operation_name}' query NOT defined"
        )
The `calculation_process_sql` function takes in a mathematical operation as a string and
performs the operation, updating the specified table with the result.

Parameters
- operation: The `operation` parameter is a dictionary that contains information about the mathematical operation to be performed. It includes the following keys:
- operation_name: The `operation_name` parameter is a string that represents the name of the mathematical operation being performed. It is used for logging and error handling purposes; defaults to "unknown".
8110 def calculation_process_function( 8111 self, operation: dict, operation_name: str = "unknown" 8112 ) -> None: 8113 """ 8114 The `calculation_process_function` takes in an operation dictionary and performs the specified 8115 function with the given parameters. 8116 8117 :param operation: The `operation` parameter is a dictionary that contains information about the 8118 operation to be performed. It has the following keys: 8119 :type operation: dict 8120 :param operation_name: The `operation_name` parameter is a string that represents the name of 8121 the operation being performed. It is used for logging purposes, defaults to unknown 8122 :type operation_name: str (optional) 8123 """ 8124 8125 operation_name = operation["name"] 8126 log.debug(f"process sql {operation_name}") 8127 function_name = operation["function_name"] 8128 function_params = operation["function_params"] 8129 getattr(self, function_name)(*function_params)
The `calculation_process_function` function takes in an operation dictionary and performs the specified
function with the given parameters.

Parameters
- operation: The `operation` parameter is a dictionary that contains information about the operation to be performed. It has the following keys:
- operation_name: The `operation_name` parameter is a string that represents the name of the operation being performed. It is used for logging purposes; defaults to "unknown".
8131 def calculation_variant_id(self) -> None: 8132 """ 8133 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 8134 updates the INFO field of a variants table with the variant ID. 8135 """ 8136 8137 # variant_id annotation field 8138 variant_id_tag = self.get_variant_id_column() 8139 added_columns = [variant_id_tag] 8140 8141 # variant_id hgvs tags" 8142 vcf_infos_tags = { 8143 variant_id_tag: "howard variant ID annotation", 8144 } 8145 8146 # Variants table 8147 table_variants = self.get_table_variants() 8148 8149 # Header 8150 vcf_reader = self.get_header() 8151 8152 # Add variant_id to header 8153 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 8154 variant_id_tag, 8155 ".", 8156 "String", 8157 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 8158 "howard calculation", 8159 "0", 8160 self.code_type_map.get("String"), 8161 ) 8162 8163 # Update 8164 sql_update = f""" 8165 UPDATE {table_variants} 8166 SET "INFO" = 8167 concat( 8168 CASE 8169 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8170 THEN '' 8171 ELSE concat("INFO", ';') 8172 END, 8173 '{variant_id_tag}=', 8174 "{variant_id_tag}" 8175 ) 8176 """ 8177 self.conn.execute(sql_update) 8178 8179 # Remove added columns 8180 for added_column in added_columns: 8181 self.drop_column(column=added_column)
The function calculation_variant_id adds a variant ID annotation to a VCF file header and
updates the INFO field of a variants table with the variant ID.
def calculation_extract_snpeff_hgvs(
    self,
    snpeff_hgvs: str = "snpeff_hgvs",
    snpeff_field: str = "ANN",
) -> None:
    """
    Extract HGVS nomenclatures from the snpEff annotation field and append
    them to the INFO column of the variants table.

    :param snpeff_hgvs: name of the INFO tag that will store the HGVS
        nomenclatures extracted from the snpEff annotations, defaults to
        "snpeff_hgvs"
    :param snpeff_field: INFO field containing the snpEff annotations,
        defaults to "ANN"
    :raises ValueError: if the snpEff header description cannot be parsed
    """

    # Snpeff hgvs tags
    vcf_infos_tags = {
        snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
    }

    # Prefix
    # NOTE(review): any non-empty configured prefix is replaced by "INFO/" — confirm intended
    prefix = self.get_explode_infos_prefix()
    if prefix:
        prefix = "INFO/"

    # snpEff exploded column names
    speff_ann_infos = prefix + snpeff_field
    speff_hgvs_infos = prefix + snpeff_hgvs

    # Variants table
    table_variants = self.get_table_variants()

    # Header
    vcf_reader = self.get_header()

    # Columns added temporarily (dropped at the end)
    added_columns = []

    # Explode snpEff field into a column
    added_columns += self.explode_infos(fields=[snpeff_field])

    if snpeff_field in vcf_reader.infos:

        log.debug(vcf_reader.infos[snpeff_field])

        # Extract the annotation sub-field names from the header description
        # (snpEff format: "... 'Allele | Annotation | ...'")
        ann_description = vcf_reader.infos[snpeff_field].desc
        pattern = r"'(.+?)'"
        match = re.search(pattern, ann_description)
        if match:
            ann_header_match = match.group(1).split(" | ")
            ann_header_desc = {}
            for ann_field in ann_header_match:
                # Normalized key: alphanumeric characters only
                ann_header_info = "".join(
                    char for char in ann_field if char.isalnum()
                )
                ann_header_desc[ann_header_info] = ann_field
            if not ann_header_desc:
                raise ValueError("Invalid header description format")
        else:
            raise ValueError("Invalid header description format")

        # Create variant id
        variant_id_column = self.get_variant_id_column()
        added_columns += [variant_id_column]

        # Create dataframe
        dataframe_snpeff_hgvs = self.get_query_to_df(
            f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
        )

        # Extract HGVS nomenclatures from each annotation
        dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
            speff_ann_infos
        ].apply(
            lambda x: extract_snpeff_hgvs(
                str(x), header=list(ann_header_desc.values())
            )
        )

        # Add snpeff_hgvs to header
        vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
            snpeff_hgvs,
            ".",
            "String",
            vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Update INFO (table name now consistent with the SELECT above,
        # was hard-coded "variants")
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    CASE
                        WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                        AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                        THEN concat(
                            '{snpeff_hgvs}=',
                            dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                        )
                        ELSE ''
                    END
                )
            FROM dataframe_snpeff_hgvs
            WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
        """
        self.conn.execute(sql_update)

        # Delete dataframe
        del dataframe_snpeff_hgvs
        gc.collect()

    else:

        log.warning(
            "No snpEff annotation. Please annotate with snpEff before using this calculation option"
        )

    # Remove added columns
    for added_column in added_columns:
        self.drop_column(column=added_column)
The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
annotation field in a VCF file and adds them as a new column in the variants table.

Parameters
- snpeff_hgvs: The `snpeff_hgvs` parameter specifies the name of the column that will store the HGVS nomenclatures extracted from the SnpEff annotation field of a VCF file; defaults to "snpeff_hgvs".
- snpeff_field: The `snpeff_field` parameter represents the field in the VCF file that contains the SnpEff annotations from which the HGVS nomenclatures are extracted; defaults to "ANN".
def calculation_snpeff_ann_explode(
    self,
    uniquify: bool = True,
    output_format: str = "fields",
    output_prefix: str = "snpeff_",
    snpeff_field: str = "ANN",
) -> None:
    """
    Explode snpEff annotations into dedicated INFO tags (or a single JSON tag)
    and append them to the INFO column of the variants table.

    :param uniquify: whether duplicate annotation values should be removed,
        defaults to True
    :param output_format: "fields" for one tag per annotation sub-field, or
        "JSON" for a single JSON-formatted tag, defaults to "fields"
    :param output_prefix: prefix for the generated tags, defaults to "snpeff_"
        (the previous docstring claimed "ANN_", which did not match the signature)
    :param snpeff_field: INFO field containing the snpEff annotations,
        defaults to "ANN"
    :raises ValueError: if the snpEff header description cannot be parsed
    """

    # SnpEff annotation field
    snpeff_hgvs = "snpeff_ann_explode"

    # Snpeff hgvs tags
    vcf_infos_tags = {
        snpeff_hgvs: "Explode snpEff annotations",
    }

    # Prefix
    # NOTE(review): any non-empty configured prefix is replaced by "INFO/" — confirm intended
    prefix = self.get_explode_infos_prefix()
    if prefix:
        prefix = "INFO/"

    # snpEff exploded column names
    speff_ann_infos = prefix + snpeff_field
    speff_hgvs_infos = prefix + snpeff_hgvs

    # Variants table
    table_variants = self.get_table_variants()

    # Header
    vcf_reader = self.get_header()

    # Columns added temporarily (dropped at the end)
    added_columns = []

    # Explode snpEff field into a column
    added_columns += self.explode_infos(fields=[snpeff_field])
    log.debug(f"snpeff_field={snpeff_field}")
    log.debug(f"added_columns={added_columns}")

    if snpeff_field in vcf_reader.infos:

        # Extract the annotation sub-field names from the header description
        ann_description = vcf_reader.infos[snpeff_field].desc
        pattern = r"'(.+?)'"
        match = re.search(pattern, ann_description)
        if match:
            ann_header_match = match.group(1).split(" | ")
            ann_header = []
            ann_header_desc = {}
            for ann_field in ann_header_match:
                # Normalized key: alphanumeric characters only
                ann_header_info = "".join(
                    char for char in ann_field if char.isalnum()
                )
                ann_header.append(ann_header_info)
                ann_header_desc[ann_header_info] = ann_field
            if not ann_header_desc:
                raise ValueError("Invalid header description format")
        else:
            raise ValueError("Invalid header description format")

        # Create variant id
        variant_id_column = self.get_variant_id_column()
        added_columns += [variant_id_column]

        # Create dataframe
        dataframe_snpeff_hgvs = self.get_query_to_df(
            f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
        )

        # Explode the snpEff annotations for each variant
        dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
            speff_ann_infos
        ].apply(
            lambda x: explode_snpeff_ann(
                str(x),
                uniquify=uniquify,
                output_format=output_format,
                prefix=output_prefix,
                header=list(ann_header_desc.values()),
            )
        )

        # Declare the output tags in the header
        ann_annotations_prefix = ""
        if output_format.upper() in ["JSON"]:
            # Single JSON tag: the value needs its "<prefix>=" key prepended
            ann_annotations_prefix = f"{output_prefix}="
            vcf_reader.infos[output_prefix] = vcf.parser._Info(
                output_prefix,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                + " - JSON format",
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )
        else:
            # One tag per snpEff annotation sub-field
            for ann_annotation in ann_header:
                ann_annotation_id = f"{output_prefix}{ann_annotation}"
                vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                    ann_annotation_id,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + f" - '{ann_header_desc[ann_annotation]}' annotation",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

        # Update INFO (table name now consistent with the SELECT above,
        # was hard-coded "variants")
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    CASE
                        WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                        AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                        THEN concat(
                            '{ann_annotations_prefix}',
                            dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                        )
                        ELSE ''
                    END
                )
            FROM dataframe_snpeff_hgvs
            WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
        """
        self.conn.execute(sql_update)

        # Delete dataframe
        del dataframe_snpeff_hgvs
        gc.collect()

    else:

        log.warning(
            "No snpEff annotation. Please annotate with snpEff before using this calculation option"
        )

    # Remove added columns
    for added_column in added_columns:
        self.drop_column(column=added_column)
The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
exploding the HGVS field and updating variant information accordingly.

Parameters
- uniquify: The `uniquify` parameter is a boolean flag that determines whether duplicate entries should be removed from the output; defaults to True.
- output_format: The `output_format` parameter specifies the format in which the output annotations will be generated, either "fields" (the default) or "JSON".
- output_prefix: The `output_prefix` parameter is the prefix added to the output annotations generated during the calculation, to differentiate them from existing annotations; defaults to "snpeff_".
- snpeff_field: The `snpeff_field` parameter specifies the field in the VCF file that contains the SnpEff annotations to explode; defaults to "ANN".
def calculation_extract_nomen(self) -> None:
    """
    Extract HGVS nomenclature (NOMEN) components and append them to the INFO
    column of the variants table.

    Reads the HGVS field and optional transcripts file configured under
    param["calculation"]["calculations"]["NOMEN"]["options"], computes the
    NOMEN dictionary for each variant, explodes it into the
    NOMEN/CNOMEN/.../GNOMEN tags and updates INFO accordingly.

    :raises ValueError: if the configured transcripts file does not exist
    """

    # NOMEN field
    field_nomen_dict = "NOMEN_DICT"

    # NOMEN structure
    nomen_dict = {
        "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
        "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
        "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
        "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
        "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
        "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
        "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
        "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
        "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
        "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
    }

    # Param
    param = self.get_param()

    # Prefix
    prefix = self.get_explode_infos_prefix()

    # Variants table (previously hard-coded as "variants" in the queries below)
    table_variants = self.get_table_variants()

    # Header
    vcf_reader = self.get_header()

    # Get HGVS field
    hgvs_field = (
        param.get("calculation", {})
        .get("calculations", {})
        .get("NOMEN", {})
        .get("options", {})
        .get("hgvs_field", "hgvs")
    )

    # Get transcripts
    transcripts_file = (
        param.get("calculation", {})
        .get("calculations", {})
        .get("NOMEN", {})
        .get("options", {})
        .get("transcripts", None)
    )
    transcripts_file = full_path(transcripts_file)
    transcripts = []
    if transcripts_file:
        if os.path.exists(transcripts_file):
            transcripts_dataframe = transcripts_file_to_df(transcripts_file)
            transcripts = transcripts_dataframe.iloc[:, 0].tolist()
        else:
            log.error(f"Transcript file '{transcripts_file}' does NOT exist")
            raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

    # Added columns
    added_columns = []

    # Explode HGVS field in column
    added_columns += self.explode_infos(fields=[hgvs_field])

    # extra infos
    extra_infos = self.get_extra_infos()
    extra_field = prefix + hgvs_field

    if extra_field in extra_infos:

        # Create dataframe
        dataframe_hgvs = self.get_query_to_df(
            f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM {table_variants} """
        )

        # Create main NOMEN column
        dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
            lambda x: find_nomen(str(x), transcripts=transcripts)
        )

        # Explode NOMEN structure and build the SQL fragments for the update
        sql_nomen_fields = []
        for nomen_field in nomen_dict:

            # Explode each field into a column
            dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                lambda x: dict(x).get(nomen_field, "")
            )

            # Create VCF header field
            vcf_reader.infos[nomen_field] = vcf.parser._Info(
                nomen_field,
                ".",
                "String",
                nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )
            sql_nomen_fields.append(
                f"""
                CASE
                    WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                    THEN concat(
                        ';{nomen_field}=',
                        dataframe_hgvs."{nomen_field}"
                    )
                    ELSE ''
                END
                """
            )

        # SQL set for update
        sql_nomen_fields_set = ", ".join(sql_nomen_fields)

        # Update (table name now consistent with the SELECT above)
        # NOTE(review): when INFO is empty, the first appended tag starts with ';'
        # because each fragment prepends the separator — confirm acceptable
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL
                        THEN ''
                        ELSE "INFO"
                    END,
                    {sql_nomen_fields_set}
                )
            FROM dataframe_hgvs
            WHERE {table_variants}."#CHROM" = dataframe_hgvs."#CHROM"
                AND {table_variants}."POS" = dataframe_hgvs."POS"
                AND {table_variants}."REF" = dataframe_hgvs."REF"
                AND {table_variants}."ALT" = dataframe_hgvs."ALT"
        """
        self.conn.execute(sql_update)

        # Delete dataframe
        del dataframe_hgvs
        gc.collect()

    # Remove added columns
    for added_column in added_columns:
        self.drop_column(column=added_column)
This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
    """
    Compute, for each variant, the number of pipelines/samples in which it was
    found, and append the result to the INFO column of the variants table.

    Does nothing when the VCF has no FORMAT column or no samples.

    :param tag: name of the INFO tag to create, defaults to "findbypipeline"
    """

    # if FORMAT and samples
    if (
        "FORMAT" in self.get_header_columns_as_list()
        and self.get_header_sample_list()
    ):

        # findbypipeline annotation field
        # (guard against an empty tag, consistent with calculation_barcode)
        if not tag:
            tag = "findbypipeline"
        findbypipeline_tag = tag

        # VCF infos tags
        vcf_infos_tags = {
            findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
        }

        # Prefix
        prefix = self.get_explode_infos_prefix()

        # Field
        findbypipeline_infos = prefix + findbypipeline_tag

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Create variant id
        variant_id_column = self.get_variant_id_column()
        added_columns = [variant_id_column]

        # variant_id, FORMAT and samples
        samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
            self.get_header_sample_list()
        )

        # Create dataframe
        dataframe_findbypipeline = self.get_query_to_df(
            f""" SELECT {samples_fields} FROM {table_variants} """
        )

        # Create findbypipeline column
        dataframe_findbypipeline[findbypipeline_infos] = (
            dataframe_findbypipeline.apply(
                lambda row: findbypipeline(
                    row, samples=self.get_header_sample_list()
                ),
                axis=1,
            )
        )

        # Add findbypipeline tag to header
        vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
            findbypipeline_tag,
            ".",
            "String",
            vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Update INFO (table name now consistent with the SELECT above,
        # was hard-coded "variants")
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    CASE
                        WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                        AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                        THEN concat(
                            '{findbypipeline_tag}=',
                            dataframe_findbypipeline."{findbypipeline_infos}"
                        )
                        ELSE ''
                    END
                )
            FROM dataframe_findbypipeline
            WHERE {table_variants}."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
        """
        self.conn.execute(sql_update)

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Delete dataframe
        del dataframe_findbypipeline
        gc.collect()
The function `calculation_find_by_pipeline` performs a calculation to find the number of
pipelines/samples for a variant and updates the variant information in a VCF file.

Parameters
- tag: The `tag` parameter is a string that represents the annotation field for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding field in the variants table; defaults to "findbypipeline".
def calculation_genotype_concordance(self) -> None:
    """
    Compute genotype concordance across callers for multi-caller VCF files and
    append the result to the INFO column of the variants table.

    Does nothing when the VCF has no FORMAT column or no samples.
    """

    # if FORMAT and samples
    if (
        "FORMAT" in self.get_header_columns_as_list()
        and self.get_header_sample_list()
    ):

        # genotypeconcordance annotation field
        genotypeconcordance_tag = "genotypeconcordance"

        # VCF infos tags
        vcf_infos_tags = {
            genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
        }

        # Prefix
        prefix = self.get_explode_infos_prefix()

        # Field
        genotypeconcordance_infos = prefix + genotypeconcordance_tag

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Create variant id
        variant_id_column = self.get_variant_id_column()
        added_columns = [variant_id_column]

        # variant_id, FORMAT and samples
        samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
            self.get_header_sample_list()
        )

        # Create dataframe
        dataframe_genotypeconcordance = self.get_query_to_df(
            f""" SELECT {samples_fields} FROM {table_variants} """
        )

        # Create genotypeconcordance column
        dataframe_genotypeconcordance[genotypeconcordance_infos] = (
            dataframe_genotypeconcordance.apply(
                lambda row: genotypeconcordance(
                    row, samples=self.get_header_sample_list()
                ),
                axis=1,
            )
        )

        # Add genotypeconcordance to header
        # (fallback description fixed: was a copy-paste of the snpEff hgvs text)
        vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
            genotypeconcordance_tag,
            ".",
            "String",
            vcf_infos_tags.get(
                genotypeconcordance_tag,
                "Concordance of genotype for multi caller VCF",
            ),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Update INFO (table name now consistent with the SELECT above,
        # was hard-coded "variants")
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    CASE
                        WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
                        AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
                        THEN concat(
                            '{genotypeconcordance_tag}=',
                            dataframe_genotypeconcordance."{genotypeconcordance_infos}"
                        )
                        ELSE ''
                    END
                )
            FROM dataframe_genotypeconcordance
            WHERE {table_variants}."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
        """
        self.conn.execute(sql_update)

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Delete dataframe
        del dataframe_genotypeconcordance
        gc.collect()
The function calculation_genotype_concordance calculates the genotype concordance for
multi-caller VCF files and updates the variant information in the database.
def calculation_barcode(self, tag: str = "barcode") -> None:
    """
    Compute the VaRank barcode for each variant and append it to the INFO
    column of the variants table.

    Does nothing when the VCF has no FORMAT column or no samples.

    :param tag: name of the INFO tag to create; an empty value falls back
        to "barcode", defaults to "barcode"
    """

    # if FORMAT and samples
    if (
        "FORMAT" in self.get_header_columns_as_list()
        and self.get_header_sample_list()
    ):

        # barcode annotation field
        if not tag:
            tag = "barcode"

        # VCF infos tags
        vcf_infos_tags = {
            tag: "barcode calculation (VaRank)",
        }

        # Prefix
        prefix = self.get_explode_infos_prefix()

        # Field
        barcode_infos = prefix + tag

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Create variant id
        variant_id_column = self.get_variant_id_column()
        added_columns = [variant_id_column]

        # variant_id, FORMAT and samples
        samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
            self.get_header_sample_list()
        )

        # Create dataframe
        dataframe_barcode = self.get_query_to_df(
            f""" SELECT {samples_fields} FROM {table_variants} """
        )

        # Create barcode column
        dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
            lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
        )

        # Add barcode to header
        # (fallback fixed: was the self-referential
        # vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), i.e. no default at all)
        vcf_reader.infos[tag] = vcf.parser._Info(
            tag,
            ".",
            "String",
            vcf_infos_tags.get(tag, "barcode calculation (VaRank)"),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Update
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    CASE
                        WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
                        AND dataframe_barcode."{barcode_infos}" NOT NULL
                        THEN concat(
                            '{tag}=',
                            dataframe_barcode."{barcode_infos}"
                        )
                        ELSE ''
                    END
                )
            FROM dataframe_barcode
            WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
        """
        self.conn.execute(sql_update)

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Delete dataframe
        del dataframe_barcode
        gc.collect()
The calculation_barcode function calculates barcode values for variants in a VCF file and
updates the INFO field in the file with the calculated barcode values.
Parameters
- tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag name that will be used for the barcode calculation in the VCF file. If no tag name is provided, the default tag name is "barcode". Defaults to "barcode".
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates a family barcode value
        for each variant from the genotypes of the pedigree samples, and appends it
        to every sample's genotype field (plus the list of family samples), updating
        FORMAT accordingly.

        Does nothing unless the header has a FORMAT column and at least one sample.

        :param tag: The `tag` parameter is the FORMAT tag added for the family
            barcode; a companion tag '<tag>S' lists the family samples. An empty
            value falls back to "BCF", defaults to BCF
        :type tag: str (optional)
        """

        # Only applicable when genotypes are present (FORMAT column + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Fall back to the default tag if an empty value was provided
            if not tag:
                tag = "BCF"

            # FORMAT tag descriptions for the VCF header
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Pedigree from param: calculation.calculations.BARCODEFAMILY.family_pedigree
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load pedigree: accepted as a JSON file path, a JSON string,
            # a comma-separated sample list, or a dict
            if ped:

                # Pedigree is a file
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: try JSON first, otherwise treat it as
                # a comma-separated sample list (each sample mapped to itself)
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Family samples are the pedigree values
                ped_samples = list(ped.values())

            else:
                # No pedigree configured: use all samples from the header
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Dataframe column that will hold the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (dropped again at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns needed for the barcode: variant id, FORMAT and family samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Fetch genotypes into a pandas dataframe.
            # NOTE: the local name 'dataframe_barcode' is referenced by name in
            # the SQL below (DuckDB resolves it from the Python scope) — do not rename.
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode for each variant row
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Declare barcode family tags in the VCF header (FORMAT records)
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per sample column plus FORMAT:
            # - family samples get the barcode value and the family sample list
            # - FORMAT gets the two new tag names
            # - other samples get '.' placeholders
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # Missing genotypes ('./.') are padded with one ':.' per FORMAT
                # key (FORMAT stripped of alphanumerics leaves only separators)
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET clauses in one UPDATE joined on the variant id
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()
The calculation_barcode_family function calculates barcode values for variants in a VCF file
and updates the INFO field in the file with the calculated barcode values.
Parameters
- tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the `tag` parameter, the default value used is "BCF". Defaults to "BCF".
9135 def calculation_trio(self) -> None: 9136 """ 9137 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 9138 information to the INFO field of each variant. 9139 """ 9140 9141 # if FORMAT and samples 9142 if ( 9143 "FORMAT" in self.get_header_columns_as_list() 9144 and self.get_header_sample_list() 9145 ): 9146 9147 # trio annotation field 9148 trio_tag = "trio" 9149 9150 # VCF infos tags 9151 vcf_infos_tags = { 9152 "trio": "trio calculation", 9153 } 9154 9155 # Param 9156 param = self.get_param() 9157 9158 # Prefix 9159 prefix = self.get_explode_infos_prefix() 9160 9161 # Trio param 9162 trio_ped = ( 9163 param.get("calculation", {}) 9164 .get("calculations", {}) 9165 .get("TRIO", {}) 9166 .get("trio_pedigree", None) 9167 ) 9168 9169 # Load trio 9170 if trio_ped: 9171 9172 # Trio pedigree is a file 9173 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 9174 log.debug("TRIO pedigree is file") 9175 with open(full_path(trio_ped)) as trio_ped: 9176 trio_ped = json.load(trio_ped) 9177 9178 # Trio pedigree is a string 9179 elif isinstance(trio_ped, str): 9180 log.debug("TRIO pedigree is str") 9181 try: 9182 trio_ped = json.loads(trio_ped) 9183 log.debug("TRIO pedigree is json str") 9184 except ValueError as e: 9185 trio_samples = trio_ped.split(",") 9186 if len(trio_samples) == 3: 9187 trio_ped = { 9188 "father": trio_samples[0], 9189 "mother": trio_samples[1], 9190 "child": trio_samples[2], 9191 } 9192 log.debug("TRIO pedigree is list str") 9193 else: 9194 msg_error = "TRIO pedigree not well formatted" 9195 log.error(msg_error) 9196 raise ValueError(msg_error) 9197 9198 # Trio pedigree is a dict 9199 elif isinstance(trio_ped, dict): 9200 log.debug("TRIO pedigree is dict") 9201 9202 # Trio pedigree is not well formatted 9203 else: 9204 msg_error = "TRIO pedigree not well formatted" 9205 log.error(msg_error) 9206 raise ValueError(msg_error) 9207 9208 # Construct trio list 9209 trio_samples = [ 9210 
trio_ped.get("father", ""), 9211 trio_ped.get("mother", ""), 9212 trio_ped.get("child", ""), 9213 ] 9214 9215 else: 9216 log.debug("TRIO pedigree not defined. Take the first 3 samples") 9217 samples_list = self.get_header_sample_list() 9218 if len(samples_list) >= 3: 9219 trio_samples = self.get_header_sample_list()[0:3] 9220 trio_ped = { 9221 "father": trio_samples[0], 9222 "mother": trio_samples[1], 9223 "child": trio_samples[2], 9224 } 9225 else: 9226 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 9227 log.error(msg_error) 9228 raise ValueError(msg_error) 9229 9230 # Check trio pedigree 9231 if not trio_ped or len(trio_ped) != 3: 9232 msg_error = f"Error in TRIO pedigree: {trio_ped}" 9233 log.error(msg_error) 9234 raise ValueError(msg_error) 9235 9236 # Log 9237 log.info( 9238 f"Calculation 'TRIO' - Samples: " 9239 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 9240 ) 9241 9242 # Field 9243 trio_infos = prefix + trio_tag 9244 9245 # Variants table 9246 table_variants = self.get_table_variants() 9247 9248 # Header 9249 vcf_reader = self.get_header() 9250 9251 # Create variant id 9252 variant_id_column = self.get_variant_id_column() 9253 added_columns = [variant_id_column] 9254 9255 # variant_id, FORMAT and samples 9256 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9257 self.get_header_sample_list() 9258 ) 9259 9260 # Create dataframe 9261 dataframe_trio = self.get_query_to_df( 9262 f""" SELECT {samples_fields} FROM {table_variants} """ 9263 ) 9264 9265 # Create trio column 9266 dataframe_trio[trio_infos] = dataframe_trio.apply( 9267 lambda row: trio(row, samples=trio_samples), axis=1 9268 ) 9269 9270 # Add trio to header 9271 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9272 trio_tag, 9273 ".", 9274 "String", 9275 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9276 "howard calculation", 9277 "0", 9278 self.code_type_map.get("String"), 9279 ) 9280 9281 # Update 9282 
sql_update = f""" 9283 UPDATE {table_variants} 9284 SET "INFO" = 9285 concat( 9286 CASE 9287 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9288 THEN '' 9289 ELSE concat("INFO", ';') 9290 END, 9291 CASE 9292 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9293 AND dataframe_trio."{trio_infos}" NOT NULL 9294 THEN concat( 9295 '{trio_tag}=', 9296 dataframe_trio."{trio_infos}" 9297 ) 9298 ELSE '' 9299 END 9300 ) 9301 FROM dataframe_trio 9302 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9303 """ 9304 self.conn.execute(sql_update) 9305 9306 # Remove added columns 9307 for added_column in added_columns: 9308 self.drop_column(column=added_column) 9309 9310 # Delete dataframe 9311 del dataframe_trio 9312 gc.collect()
The calculation_trio function performs trio calculations on a VCF file by adding trio
information to the INFO field of each variant.
9314 def calculation_vaf_normalization(self) -> None: 9315 """ 9316 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9317 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9318 :return: The function does not return anything. 9319 """ 9320 9321 # if FORMAT and samples 9322 if ( 9323 "FORMAT" in self.get_header_columns_as_list() 9324 and self.get_header_sample_list() 9325 ): 9326 9327 # vaf_normalization annotation field 9328 vaf_normalization_tag = "VAF" 9329 9330 # VCF infos tags 9331 vcf_infos_tags = { 9332 "VAF": "VAF Variant Frequency", 9333 } 9334 9335 # Prefix 9336 prefix = self.get_explode_infos_prefix() 9337 9338 # Variants table 9339 table_variants = self.get_table_variants() 9340 9341 # Header 9342 vcf_reader = self.get_header() 9343 9344 # Do not calculate if VAF already exists 9345 if "VAF" in vcf_reader.formats: 9346 log.debug("VAF already on genotypes") 9347 return 9348 9349 # Create variant id 9350 variant_id_column = self.get_variant_id_column() 9351 added_columns = [variant_id_column] 9352 9353 # variant_id, FORMAT and samples 9354 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9355 f""" "{sample}" """ for sample in self.get_header_sample_list() 9356 ) 9357 9358 # Create dataframe 9359 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """ 9360 log.debug(f"query={query}") 9361 dataframe_vaf_normalization = self.get_query_to_df(query=query) 9362 9363 vaf_normalization_set = [] 9364 9365 # for each sample vaf_normalization 9366 for sample in self.get_header_sample_list(): 9367 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9368 lambda row: vaf_normalization(row, sample=sample), axis=1 9369 ) 9370 vaf_normalization_set.append( 9371 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9372 ) 9373 9374 # Add VAF to FORMAT 9375 dataframe_vaf_normalization["FORMAT"] = 
dataframe_vaf_normalization[ 9376 "FORMAT" 9377 ].apply(lambda x: str(x) + ":VAF") 9378 vaf_normalization_set.append( 9379 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9380 ) 9381 9382 # Add vaf_normalization to header 9383 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9384 id=vaf_normalization_tag, 9385 num="1", 9386 type="Float", 9387 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9388 type_code=self.code_type_map.get("Float"), 9389 ) 9390 9391 # Create fields to add in INFO 9392 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9393 9394 # Update 9395 sql_update = f""" 9396 UPDATE {table_variants} 9397 SET {sql_vaf_normalization_set} 9398 FROM dataframe_vaf_normalization 9399 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9400 9401 """ 9402 self.conn.execute(sql_update) 9403 9404 # Remove added columns 9405 for added_column in added_columns: 9406 self.drop_column(column=added_column) 9407 9408 # Delete dataframe 9409 del dataframe_vaf_normalization 9410 gc.collect()
The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency)
normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
Returns
The function does not return anything.
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics
        (count, list, min, max, mean, median, standard deviation) for a given
        genotype information field across all samples, and appends one INFO tag
        per statistic (e.g. 'VAF_stats_nb=...;VAF_stats_min=...').

        Does nothing unless the header has a FORMAT column and at least one sample.

        :param info: The `info` parameter is the genotype field name the
            statistics are computed for (used to derive the '<info>_stats_*'
            INFO tags), defaults to VAF
        :type info: str (optional)
        """

        # Only applicable when genotypes are present (FORMAT column + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Base tag for the stats bundle (e.g. 'VAF_stats')
            vaf_stats_tag = info + "_stats"

            # One INFO tag (and description) per statistic
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Dataframe column holding the per-variant stats dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (dropped again at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns needed: variant id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Fetch genotypes into a pandas dataframe.
            # NOTE: the local name 'dataframe_vaf_stats' is referenced by name
            # in the SQL below (DuckDB scope resolution) — do not rename.
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the statistics dict for each variant row
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one per statistic tag
            sql_vaf_stats_fields = []

            # For each statistic: extract its value into its own column,
            # declare the header tag, and build the INFO concat fragment
            for stat in vcf_infos_tags:

                # Extract this statistic from the per-variant stats dict
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the statistic tag in the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # First fragment gets no leading ';' separator, the rest do
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Fragment emitting '<sep><stat>=<value>' (empty when NULL)
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                            '{sep}{stat}=',
                            dataframe_vaf_stats."{stat}"
                        )
                        ELSE ''
                    END
                    """
                )

            # All fragments become arguments of one concat() in the UPDATE
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Append the statistics to INFO, separated by ';' when INFO is non-empty
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()
The calculation_genotype_stats function calculates genotype statistics for a given information
field in a VCF file and updates the INFO column of the variants table with the calculated
statistics.
Parameters
- info: The `info` parameter is a string that represents the type of information for which genotype statistics are calculated. It is used to generate various VCF info tags for the statistics, such as the number of occurrences, the list of values, the minimum value, the maximum value, the mean, and the median. Defaults to "VAF".
9550 def calculation_transcripts_annotation( 9551 self, info_json: str = None, info_format: str = None 9552 ) -> None: 9553 """ 9554 The `calculation_transcripts_annotation` function creates a transcripts table and adds an info 9555 field to it if transcripts are available. 9556 9557 :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method 9558 is a string parameter that represents the information field to be used in the transcripts JSON. 9559 It is used to specify the JSON format for the transcripts information. If no value is provided 9560 when calling the method, it defaults to " 9561 :type info_json: str 9562 :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation` 9563 method is a string parameter that specifies the format of the information field to be used in 9564 the transcripts JSON. It is used to define the format of the information field 9565 :type info_format: str 9566 """ 9567 9568 # Create transcripts table 9569 transcripts_table = self.create_transcript_view() 9570 9571 # Add info field 9572 if transcripts_table: 9573 self.transcript_view_to_variants( 9574 transcripts_table=transcripts_table, 9575 transcripts_info_field_json=info_json, 9576 transcripts_info_field_format=info_format, 9577 ) 9578 else: 9579 log.info("No Transcripts to process. Check param.json file configuration")
The calculation_transcripts_annotation function creates a transcripts table and adds an info
field to it if transcripts are available.
Parameters
- info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method is a string parameter that represents the information field to be used in the transcripts JSON. It is used to specify the JSON format for the transcripts information; it defaults to None.
- info_format: The `info_format` parameter in the `calculation_transcripts_annotation` method is a string parameter that specifies the format of the information field to be used in the transcripts JSON. It is used to define the format of the information field.
9581 def calculation_transcripts_prioritization(self) -> None: 9582 """ 9583 The function `calculation_transcripts_prioritization` creates a transcripts table and 9584 prioritizes transcripts based on certain criteria. 9585 """ 9586 9587 # Create transcripts table 9588 transcripts_table = self.create_transcript_view() 9589 9590 # Add info field 9591 if transcripts_table: 9592 self.transcripts_prioritization(transcripts_table=transcripts_table) 9593 else: 9594 log.info("No Transcripts to process. Check param.json file configuration")
The function calculation_transcripts_prioritization creates a transcripts table and
prioritizes transcripts based on certain criteria.
9600 def transcripts_prioritization( 9601 self, transcripts_table: str = None, param: dict = {} 9602 ) -> bool: 9603 """ 9604 The `transcripts_prioritization` function prioritizes transcripts based on certain parameters 9605 and updates the variants table with the prioritized information. 9606 9607 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 9608 of the table containing transcripts data. If no value is provided, it defaults to "transcripts". 9609 This parameter is used to identify the table where the transcripts data is stored for the 9610 prioritization process 9611 :type transcripts_table: str 9612 :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary 9613 that contains various configuration settings for the prioritization process of transcripts. It 9614 is used to customize the behavior of the prioritization algorithm and includes settings such as 9615 the prefix for prioritization fields, default profiles, and other 9616 :type param: dict 9617 :return: The function `transcripts_prioritization` returns a boolean value `True` if the 9618 transcripts prioritization process is successfully completed, and `False` if there are any 9619 issues or if no profile is defined for transcripts prioritization. 
9620 """ 9621 9622 log.debug("Start transcripts prioritization...") 9623 9624 # Param 9625 if not param: 9626 param = self.get_param() 9627 9628 # Variants table 9629 table_variants = self.get_table_variants() 9630 log.debug(f"transcripts_table={transcripts_table}") 9631 # Transcripts table 9632 if transcripts_table is None: 9633 log.debug(f"transcripts_table={transcripts_table}") 9634 transcripts_table = self.create_transcript_view( 9635 transcripts_table="transcripts", param=param 9636 ) 9637 log.debug(f"transcripts_table={transcripts_table}") 9638 if transcripts_table is None: 9639 msg_err = "No Transcripts table availalble" 9640 log.error(msg_err) 9641 raise ValueError(msg_err) 9642 9643 # Get transcripts columns 9644 columns_as_list_query = f""" 9645 DESCRIBE {transcripts_table} 9646 """ 9647 columns_as_list = list( 9648 self.get_query_to_df(columns_as_list_query)["column_name"] 9649 ) 9650 9651 # Create INFO if not exists 9652 if "INFO" not in columns_as_list: 9653 query_add_info = f""" 9654 ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT ''; 9655 """ 9656 self.execute_query(query_add_info) 9657 9658 # Prioritization param and Force only PZ Score and Flag 9659 pz_param = param.get("transcripts", {}).get("prioritization", {}) 9660 pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score" 9661 pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag" 9662 pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript" 9663 pz_param["pzfields"] = [pz_fields_score, pz_fields_flag] 9664 pz_profile_default = ( 9665 param.get("transcripts", {}).get("prioritization", {}).get("profiles", None) 9666 ) 9667 9668 # Exit if no profile 9669 if pz_profile_default is None: 9670 log.warning("No profile defined for transcripts prioritization") 9671 return False 9672 9673 # Prioritization 9674 prioritization_result = self.prioritization( 9675 table=transcripts_table, 9676 pz_param=param.get("transcripts", {}).get("prioritization", {}), 9677 ) 9678 if not 
prioritization_result: 9679 log.warning("Transcripts prioritization not processed") 9680 return False 9681 9682 # Explode PZ fields 9683 self.explode_infos( 9684 table=transcripts_table, 9685 fields=param.get("transcripts", {}) 9686 .get("prioritization", {}) 9687 .get("pzfields", []), 9688 ) 9689 9690 # Export Transcripts prioritization infos to variants table 9691 query_update = f""" 9692 WITH RankedTranscripts AS ( 9693 SELECT 9694 "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag}, 9695 ROW_NUMBER() OVER ( 9696 PARTITION BY "#CHROM", POS, REF, ALT 9697 ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC 9698 ) AS rn 9699 FROM 9700 {transcripts_table} 9701 ) 9702 UPDATE {table_variants} 9703 SET 9704 INFO = CONCAT(CASE 9705 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9706 THEN '' 9707 ELSE concat("INFO", ';') 9708 END, 9709 concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag}) 9710 ) 9711 FROM 9712 RankedTranscripts 9713 WHERE 9714 rn = 1 9715 AND variants."#CHROM" = RankedTranscripts."#CHROM" 9716 AND variants."POS" = RankedTranscripts."POS" 9717 AND variants."REF" = RankedTranscripts."REF" 9718 AND variants."ALT" = RankedTranscripts."ALT" 9719 9720 """ 9721 self.execute_query(query=query_update) 9722 9723 # Add PZ Transcript in header 9724 self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info( 9725 pz_fields_transcripts, 9726 ".", 9727 "String", 9728 f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}", 9729 "unknown", 9730 "unknown", 9731 code_type_map["String"], 9732 ) 9733 9734 # Return 9735 return True
The transcripts_prioritization function prioritizes transcripts based on certain parameters
and updates the variants table with the prioritized information.
Parameters
- transcripts_table: The `transcripts_table` parameter is a string that specifies the name of the table containing transcripts data. If no value is provided, it defaults to "transcripts". This parameter is used to identify the table where the transcripts data is stored for the prioritization process.
- param: The `param` parameter in the `transcripts_prioritization` method is a dictionary that contains various configuration settings for the prioritization process of transcripts. It is used to customize the behavior of the prioritization algorithm and includes settings such as the prefix for prioritization fields, default profiles, and others.
Returns
The function `transcripts_prioritization` returns a boolean value: `True` if the transcripts prioritization process is successfully completed, and `False` if there are any issues or if no profile is defined for transcripts prioritization.
9737 def create_transcript_view_from_columns_map( 9738 self, 9739 transcripts_table: str = "transcripts", 9740 columns_maps: dict = {}, 9741 added_columns: list = [], 9742 temporary_tables: list = None, 9743 annotation_fields: list = None, 9744 ) -> tuple[list, list, list]: 9745 """ 9746 The `create_transcript_view_from_columns_map` function generates a temporary table view based on 9747 specified columns mapping for transcripts data. 9748 9749 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of 9750 the table where the transcripts data is stored or will be stored in the database. This table 9751 typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, 9752 predictions, etc. It defaults to "transcripts, defaults to transcripts 9753 :type transcripts_table: str (optional) 9754 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about 9755 how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list 9756 represents a mapping configuration for a specific set of columns. It typically includes details such 9757 as the main transcript column and additional information columns 9758 :type columns_maps: dict 9759 :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map` 9760 function is a list that stores the additional columns that will be added to the view being created 9761 based on the columns map provided. These columns are generated by exploding the transcript 9762 information columns along with the main transcript column 9763 :type added_columns: list 9764 :param temporary_tables: The `temporary_tables` parameter in the 9765 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 9766 tables created during the process of creating a transcript view from a columns map. 
These temporary 9767 tables are used to store intermediate results or transformations before the final view is generated 9768 :type temporary_tables: list 9769 :param annotation_fields: The `annotation_fields` parameter in the 9770 `create_transcript_view_from_columns_map` function is a list that stores the fields that are used 9771 for annotation in the query view creation process. These fields are extracted from the 9772 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 9773 :type annotation_fields: list 9774 :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three 9775 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 9776 """ 9777 9778 log.debug("Start transcrpts view creation from columns map...") 9779 9780 # "from_columns_map": [ 9781 # { 9782 # "transcripts_column": "Ensembl_transcriptid", 9783 # "transcripts_infos_columns": [ 9784 # "genename", 9785 # "Ensembl_geneid", 9786 # "LIST_S2_score", 9787 # "LIST_S2_pred", 9788 # ], 9789 # }, 9790 # { 9791 # "transcripts_column": "Ensembl_transcriptid", 9792 # "transcripts_infos_columns": [ 9793 # "genename", 9794 # "VARITY_R_score", 9795 # "Aloft_pred", 9796 # ], 9797 # }, 9798 # ], 9799 9800 # Init 9801 if temporary_tables is None: 9802 temporary_tables = [] 9803 if annotation_fields is None: 9804 annotation_fields = [] 9805 9806 # Variants table 9807 table_variants = self.get_table_variants() 9808 9809 for columns_map in columns_maps: 9810 9811 # Transcript column 9812 transcripts_column = columns_map.get("transcripts_column", None) 9813 9814 # Transcripts infos columns 9815 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 9816 9817 if transcripts_column is not None: 9818 9819 # Explode 9820 added_columns += self.explode_infos( 9821 fields=[transcripts_column] + transcripts_infos_columns 9822 ) 9823 9824 # View clauses 9825 clause_select = [] 9826 for field in [transcripts_column] + 
transcripts_infos_columns: 9827 clause_select.append( 9828 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 9829 ) 9830 if field not in [transcripts_column]: 9831 annotation_fields.append(field) 9832 9833 # Querey View 9834 query = f""" 9835 SELECT 9836 "#CHROM", POS, REF, ALT, INFO, 9837 "{transcripts_column}" AS 'transcript', 9838 {", ".join(clause_select)} 9839 FROM ( 9840 SELECT 9841 "#CHROM", POS, REF, ALT, INFO, 9842 {", ".join(clause_select)} 9843 FROM {table_variants} 9844 ) 9845 WHERE "{transcripts_column}" IS NOT NULL 9846 """ 9847 9848 # Create temporary table 9849 temporary_table = transcripts_table + "".join( 9850 random.choices(string.ascii_uppercase + string.digits, k=10) 9851 ) 9852 9853 # Temporary_tables 9854 temporary_tables.append(temporary_table) 9855 query_view = f""" 9856 CREATE TEMPORARY TABLE {temporary_table} 9857 AS ({query}) 9858 """ 9859 self.execute_query(query=query_view) 9860 9861 return added_columns, temporary_tables, annotation_fields
The create_transcript_view_from_columns_map function generates a temporary table view based on
specified columns mapping for transcripts data.
Parameters
- transcripts_table: The
`transcripts_table` parameter is a string that specifies the name of the table where the transcripts data is stored or will be stored in the database. This table typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, predictions, etc. It defaults to "transcripts". - columns_maps: The
columns_mapsparameter is a dictionary that contains information about how to map columns from a transcripts table to create a view. Each entry in thecolumns_mapslist represents a mapping configuration for a specific set of columns. It typically includes details such as the main transcript column and additional information columns - added_columns: The
added_columnsparameter in thecreate_transcript_view_from_columns_mapfunction is a list that stores the additional columns that will be added to the view being created based on the columns map provided. These columns are generated by exploding the transcript information columns along with the main transcript column - temporary_tables: The
temporary_tablesparameter in thecreate_transcript_view_from_columns_mapfunction is a list that stores the names of temporary tables created during the process of creating a transcript view from a columns map. These temporary tables are used to store intermediate results or transformations before the final view is generated - annotation_fields: The
annotation_fieldsparameter in thecreate_transcript_view_from_columns_mapfunction is a list that stores the fields that are used for annotation in the query view creation process. These fields are extracted from thetranscripts_columnandtranscripts_infos_columnsspecified in the `columns
Returns
The function
`create_transcript_view_from_columns_map` returns a tuple containing three lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
9863 def create_transcript_view_from_column_format( 9864 self, 9865 transcripts_table: str = "transcripts", 9866 column_formats: dict = {}, 9867 temporary_tables: list = None, 9868 annotation_fields: list = None, 9869 ) -> tuple[list, list, list]: 9870 """ 9871 The `create_transcript_view_from_column_format` function generates a transcript view based on 9872 specified column formats, adds additional columns and annotation fields, and returns the list of 9873 temporary tables and annotation fields. 9874 9875 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of 9876 the table containing the transcripts data. This table will be used as the base table for creating 9877 the transcript view. The default value for this parameter is "transcripts", but you can provide a 9878 different table name if needed, defaults to transcripts 9879 :type transcripts_table: str (optional) 9880 :param column_formats: The `column_formats` parameter is a dictionary that contains information 9881 about the columns to be used for creating the transcript view. Each entry in the dictionary 9882 specifies the mapping between a transcripts column and a transcripts infos column. For example, in 9883 the provided code snippet: 9884 :type column_formats: dict 9885 :param temporary_tables: The `temporary_tables` parameter in the 9886 `create_transcript_view_from_column_format` function is a list that stores the names of temporary 9887 views created during the process of creating a transcript view from a column format. These temporary 9888 views are used to manipulate and extract data before generating the final transcript view. It 9889 :type temporary_tables: list 9890 :param annotation_fields: The `annotation_fields` parameter in the 9891 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 9892 that are extracted from the temporary views created during the process. 
These annotation fields are 9893 obtained by querying the temporary views and extracting the column names excluding specific columns 9894 like `#CH 9895 :type annotation_fields: list 9896 :return: The `create_transcript_view_from_column_format` function returns two lists: 9897 `temporary_tables` and `annotation_fields`. 9898 """ 9899 9900 log.debug("Start transcrpts view creation from column format...") 9901 9902 # "from_column_format": [ 9903 # { 9904 # "transcripts_column": "ANN", 9905 # "transcripts_infos_column": "Feature_ID", 9906 # } 9907 # ], 9908 9909 # Init 9910 if temporary_tables is None: 9911 temporary_tables = [] 9912 if annotation_fields is None: 9913 annotation_fields = [] 9914 9915 for column_format in column_formats: 9916 9917 # annotation field and transcript annotation field 9918 annotation_field = column_format.get("transcripts_column", "ANN") 9919 transcript_annotation = column_format.get( 9920 "transcripts_infos_column", "Feature_ID" 9921 ) 9922 9923 # Temporary View name 9924 temporary_view_name = transcripts_table + "".join( 9925 random.choices(string.ascii_uppercase + string.digits, k=10) 9926 ) 9927 9928 # Create temporary view name 9929 temporary_view_name = self.annotation_format_to_table( 9930 uniquify=True, 9931 annotation_field=annotation_field, 9932 view_name=temporary_view_name, 9933 annotation_id=transcript_annotation, 9934 ) 9935 9936 # Annotation fields 9937 if temporary_view_name: 9938 query_annotation_fields = f""" 9939 SELECT * 9940 FROM ( 9941 DESCRIBE SELECT * 9942 FROM {temporary_view_name} 9943 ) 9944 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 9945 """ 9946 df_annotation_fields = self.get_query_to_df( 9947 query=query_annotation_fields 9948 ) 9949 9950 # Add temporary view and annotation fields 9951 temporary_tables.append(temporary_view_name) 9952 annotation_fields += list(set(df_annotation_fields["column_name"])) 9953 9954 return temporary_tables, annotation_fields
The create_transcript_view_from_column_format function generates a transcript view based on
specified column formats, adds additional columns and annotation fields, and returns the list of
temporary tables and annotation fields.
Parameters
- transcripts_table: The
transcripts_tableparameter is a string that specifies the name of the table containing the transcripts data. This table will be used as the base table for creating the transcript view. The default value for this parameter is "transcripts", but you can provide a different table name if needed, defaults to transcripts - column_formats: The
column_formatsparameter is a dictionary that contains information about the columns to be used for creating the transcript view. Each entry in the dictionary specifies the mapping between a transcripts column and a transcripts infos column. For example, in the provided code snippet: - temporary_tables: The
temporary_tablesparameter in thecreate_transcript_view_from_column_formatfunction is a list that stores the names of temporary views created during the process of creating a transcript view from a column format. These temporary views are used to manipulate and extract data before generating the final transcript view. It - annotation_fields: The
annotation_fieldsparameter in thecreate_transcript_view_from_column_formatfunction is a list that stores the annotation fields that are extracted from the temporary views created during the process. These annotation fields are obtained by querying the temporary views and extracting the column names excluding specific columns like `#CH
Returns
The
`create_transcript_view_from_column_format` function returns two lists: `temporary_tables` and `annotation_fields`.
9956 def create_transcript_view( 9957 self, 9958 transcripts_table: str = None, 9959 transcripts_table_drop: bool = True, 9960 param: dict = {}, 9961 ) -> str: 9962 """ 9963 The `create_transcript_view` function generates a transcript view by processing data from a 9964 specified table based on provided parameters and structural information. 9965 9966 :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function 9967 is used to specify the name of the table that will store the final transcript view data. If a table 9968 name is not provided, the function will create a new table to store the transcript view data, and by 9969 default,, defaults to transcripts 9970 :type transcripts_table: str (optional) 9971 :param transcripts_table_drop: The `transcripts_table_drop` parameter in the 9972 `create_transcript_view` function is a boolean parameter that determines whether to drop the 9973 existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, 9974 the function will drop the existing transcripts table if it exists, defaults to True 9975 :type transcripts_table_drop: bool (optional) 9976 :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that 9977 contains information needed to create a transcript view. It includes details such as the structure 9978 of the transcripts, columns mapping, column formats, and other necessary information for generating 9979 the view. This parameter allows for flexibility and customization 9980 :type param: dict 9981 :return: The `create_transcript_view` function returns the name of the transcripts table that was 9982 created or modified during the execution of the function. 
9983 """ 9984 9985 log.debug("Start transcripts view creation...") 9986 9987 # Default 9988 transcripts_table_default = "transcripts" 9989 9990 # Param 9991 if not param: 9992 param = self.get_param() 9993 9994 # Struct 9995 struct = param.get("transcripts", {}).get("struct", None) 9996 9997 if struct: 9998 9999 # Transcripts table 10000 if transcripts_table is None: 10001 transcripts_table = param.get("transcripts", {}).get( 10002 "table", transcripts_table_default 10003 ) 10004 10005 # added_columns 10006 added_columns = [] 10007 10008 # Temporary tables 10009 temporary_tables = [] 10010 10011 # Annotation fields 10012 annotation_fields = [] 10013 10014 # from columns map 10015 columns_maps = struct.get("from_columns_map", []) 10016 added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = ( 10017 self.create_transcript_view_from_columns_map( 10018 transcripts_table=transcripts_table, 10019 columns_maps=columns_maps, 10020 added_columns=added_columns, 10021 temporary_tables=temporary_tables, 10022 annotation_fields=annotation_fields, 10023 ) 10024 ) 10025 added_columns += added_columns_tmp 10026 temporary_tables += temporary_tables_tmp 10027 annotation_fields += annotation_fields_tmp 10028 10029 # from column format 10030 column_formats = struct.get("from_column_format", []) 10031 temporary_tables_tmp, annotation_fields_tmp = ( 10032 self.create_transcript_view_from_column_format( 10033 transcripts_table=transcripts_table, 10034 column_formats=column_formats, 10035 temporary_tables=temporary_tables, 10036 annotation_fields=annotation_fields, 10037 ) 10038 ) 10039 temporary_tables += temporary_tables_tmp 10040 annotation_fields += annotation_fields_tmp 10041 10042 # Merge temporary tables query 10043 query_merge = "" 10044 for temporary_table in temporary_tables: 10045 10046 # First temporary table 10047 if not query_merge: 10048 query_merge = f""" 10049 SELECT * FROM {temporary_table} 10050 """ 10051 # other temporary table (using UNION) 10052 else: 10053 
query_merge += f""" 10054 UNION BY NAME SELECT * FROM {temporary_table} 10055 """ 10056 10057 # Merge on transcript 10058 query_merge_on_transcripts_annotation_fields = [] 10059 # Aggregate all annotations fields 10060 for annotation_field in set(annotation_fields): 10061 query_merge_on_transcripts_annotation_fields.append( 10062 f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """ 10063 ) 10064 # Query for transcripts view 10065 query_merge_on_transcripts = f""" 10066 SELECT "#CHROM", POS, REF, ALT, INFO, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)} 10067 FROM ({query_merge}) 10068 GROUP BY "#CHROM", POS, REF, ALT, INFO, transcript 10069 """ 10070 10071 # Drop transcript view is necessary 10072 if transcripts_table_drop: 10073 query_drop = f""" 10074 DROP TABLE IF EXISTS {transcripts_table}; 10075 """ 10076 self.execute_query(query=query_drop) 10077 10078 # Merge and create transcript view 10079 query_create_view = f""" 10080 CREATE TABLE IF NOT EXISTS {transcripts_table} 10081 AS {query_merge_on_transcripts} 10082 """ 10083 self.execute_query(query=query_create_view) 10084 10085 # Remove added columns 10086 for added_column in added_columns: 10087 self.drop_column(column=added_column) 10088 10089 else: 10090 10091 transcripts_table = None 10092 10093 return transcripts_table
The create_transcript_view function generates a transcript view by processing data from a
specified table based on provided parameters and structural information.
Parameters
- transcripts_table: The
transcripts_tableparameter in thecreate_transcript_viewfunction is used to specify the name of the table that will store the final transcript view data. If a table name is not provided, the function will create a new table to store the transcript view data, and by default,, defaults to transcripts - transcripts_table_drop: The
transcripts_table_dropparameter in thecreate_transcript_viewfunction is a boolean parameter that determines whether to drop the existing transcripts table before creating a new one. Iftranscripts_table_dropis set toTrue, the function will drop the existing transcripts table if it exists, defaults to True - param: The
paramparameter in thecreate_transcript_viewfunction is a dictionary that contains information needed to create a transcript view. It includes details such as the structure of the transcripts, columns mapping, column formats, and other necessary information for generating the view. This parameter allows for flexibility and customization
Returns
The
`create_transcript_view` function returns the name of the transcripts table that was created or modified during the execution of the function.
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
    ) -> str:
        """
        Explode a structured VCF annotation INFO field (e.g. snpEff 'ANN')
        into a temporary table with one typed column per annotation sub-field
        and a 'transcript' column.

        The sub-field names are parsed from the quoted part of the INFO
        field's header description (split on " | "). Each variant's
        annotation string is converted to JSON (via
        `explode_annotation_format`), the JSON keys are typed by inspecting
        their values, and a temporary DuckDB table named `view_name` is
        created with columns "#CHROM", POS, REF, ALT, INFO, the typed
        sub-fields, and 'transcript' copied from `annotation_id`.

        :param uniquify: Passed through to `explode_annotation_format` to
            control uniquification of the exploded values, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: Name of the INFO field containing the
            structured annotations, defaults to ANN
        :type annotation_field: str (optional)
        :param annotation_id: Sub-field used as the transcript identifier; it
            is sanitized to alphanumeric characters and exposed as the
            'transcript' column, defaults to Feature_ID
        :type annotation_id: str (optional)
        :param view_name: Name of the temporary table to create, defaults to
            transcripts
        :type view_name: str (optional)
        :raises ValueError: When the INFO field's header description does not
            contain a quoted, pipe-separated sub-field list
        :return: The name of the created temporary table, or None when
            `annotation_field` is not declared in the VCF header
        """

        # Name of the intermediate DataFrame column holding the JSON-exploded
        # annotations
        annotation_format = "annotation_explode"

        # Sanitize the transcript identifier to alphanumerics (it is injected
        # into SQL below)
        annotation_id = "".join(char for char in annotation_id if char.isalnum())

        # Prefix: any truthy explode prefix is normalized to "INFO/"
        # NOTE(review): when the prefix is truthy the queries below still
        # reference {annotation_format} without the prefix while the DataFrame
        # column is prefix + annotation_format — confirm the truthy-prefix
        # path is exercised/correct.
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names for the source annotation and its exploded form
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added to the variants table, to be dropped before returning
        added_columns = []

        # Explode the annotation INFO field into a variants-table column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the sub-field list from the quoted portion of the INFO
            # header description (e.g. "... 'Allele | Annotation | ...'")
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                # Maps sanitized sub-field name -> original sub-field name
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id column (also dropped on exit)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Pull variants plus the exploded annotation column into pandas
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Convert each annotation string to a JSON object keyed by
            # transcript entry
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Discover the JSON keys present in the first entry of each object
            # (DuckDB can query the pandas DataFrame by name)
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed SELECT clause per JSON key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key
                key = row.iloc[0]

                # Sanitized key, safe to use as a SQL identifier
                key_clean = "".join(char for char in key if char.isalnum())

                # Extract all values of this key to sample their type
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace
                # empty strings or None with NaN and drop rows with NaN, so
                # type detection only sees real values
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type
                column_type = detect_column_type(df_json_type[key_clean])

                # Typed extraction clause; NULLIF turns empty strings into
                # NULL before the cast
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Materialize the exploded annotations as a temporary table, one
            # row per transcript entry
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                    )
                );
            """

            self.execute_query(query=query_view)

        else:

            # Annotation field not in header: nothing to explode
            view_name = None

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
The function annotation_format_to_table converts annotation data from a VCF file into a structured
table format.
Parameters
- uniquify: The
uniquifyparameter is a boolean flag that determines whether to ensure unique values in the output or not. If set toTrue, the function will make sure that the output values are unique, defaults to True - annotation_field: The
annotation_fieldparameter refers to the field in the VCF file that contains the annotation information for each variant. This field is used to extract the annotation details for further processing in the function, defaults to ANN - annotation_id: The
annotation_idparameter in theannotation_format_to_tablemethod is used to specify the identifier for the annotation feature. This identifier will be used as a column name in the resulting table or view that is created based on the annotation data. It helps in uniquely identifying each annotation entry in the, defaults to Feature_ID - view_name: The
view_nameparameter in theannotation_format_to_tablemethod is used to specify the name of the temporary table that will be created to store the transformed annotation data. This table will hold the extracted information from the annotation field in a structured format for further processing or analysis, defaults to transcripts
Returns
The function
`annotation_format_to_table` is returning the name of the view created, which is stored in the variable `view_name`.
    def transcript_view_to_variants(
        self,
        transcripts_table: str = None,
        transcripts_column_id: str = None,
        transcripts_info_json: str = None,
        transcripts_info_field_json: str = None,
        transcripts_info_format: str = None,
        transcripts_info_field_format: str = None,
        param: dict = {},
    ) -> bool:
        """
        Write transcript annotations from the transcripts table back onto the
        variants table, as a JSON column, a JSON INFO field, a pipe-separated
        FORMAT-style column, and/or a FORMAT-style INFO field.

        Each requested output triggers an UPDATE of the variants table joined
        on ("#CHROM", POS, REF, ALT), aggregating all transcripts of a
        variant. The VCF header is extended with an INFO entry for each field
        written.

        :param transcripts_table: Name of the transcripts table; when None it
            is taken from param["transcripts"]["table"], defaulting to
            "transcripts"
        :type transcripts_table: str
        :param transcripts_column_id: Column of the transcripts table holding
            the transcript identifier; when None it is taken from
            param["transcripts"]["column_id"], defaulting to "transcript"
        :type transcripts_column_id: str
        :param transcripts_info_json: Name of the variants-table column to
            create with the transcripts as JSON; when None it is taken from
            param["transcripts"]["transcripts_info_json"]
        :type transcripts_info_json: str
        :param transcripts_info_field_json: Name of the INFO field to append
            with the transcripts as JSON; when None it is taken from
            param["transcripts"]["transcripts_info_field_json"]
        :type transcripts_info_field_json: str
        :param transcripts_info_format: Name of the variants-table column to
            create with the transcripts in pipe-separated format; when None it
            is taken from param["transcripts"]["transcripts_info_format"]
        :type transcripts_info_format: str
        :param transcripts_info_field_format: Name of the INFO field to append
            with the transcripts in pipe-separated format; when None it is
            taken from param["transcripts"]["transcripts_info_field_format"]
        :type transcripts_info_field_format: str
        :param param: Parameters dictionary; when empty, the object's own
            parameters (`get_param`) are used
        :type param: dict
        :return: True when at least one output was requested and the updates
            were executed; False when none of the four output targets is set
        """

        msg_info_prefix = "Start transcripts view to variants annotations"

        log.debug(f"{msg_info_prefix}...")

        # Default
        transcripts_table_default = "transcripts"
        transcripts_column_id_default = "transcript"
        transcripts_info_json_default = None
        transcripts_info_format_default = None
        transcripts_info_field_json_default = None
        transcripts_info_field_format_default = None

        # Param (falls back to the object's parameters)
        if not param:
            param = self.get_param()

        # Transcripts table
        if transcripts_table is None:
            transcripts_table = param.get("transcripts", {}).get(
                "table", transcripts_table_default
            )

        # Transcripts column ID
        if transcripts_column_id is None:
            transcripts_column_id = param.get("transcripts", {}).get(
                "column_id", transcripts_column_id_default
            )

        # Transcripts info json
        if transcripts_info_json is None:
            transcripts_info_json = param.get("transcripts", {}).get(
                "transcripts_info_json", transcripts_info_json_default
            )

        # Transcripts info field JSON
        if transcripts_info_field_json is None:
            transcripts_info_field_json = param.get("transcripts", {}).get(
                "transcripts_info_field_json", transcripts_info_field_json_default
            )
        # if transcripts_info_field_json is not None and transcripts_info_json is None:
        #     transcripts_info_json = transcripts_info_field_json

        # Transcripts info format
        if transcripts_info_format is None:
            transcripts_info_format = param.get("transcripts", {}).get(
                "transcripts_info_format", transcripts_info_format_default
            )

        # Transcripts info field FORMAT
        if transcripts_info_field_format is None:
            transcripts_info_field_format = param.get("transcripts", {}).get(
                "transcripts_info_field_format", transcripts_info_field_format_default
            )
        # if (
        #     transcripts_info_field_format is not None
        #     and transcripts_info_format is None
        # ):
        #     transcripts_info_format = transcripts_info_field_format

        # Variants table
        table_variants = self.get_table_variants()

        # Nothing requested: no output column or INFO field is set
        if (
            transcripts_info_json is None
            and transcripts_info_field_json is None
            and transcripts_info_format is None
            and transcripts_info_field_format is None
        ):
            return False

        # Data columns of the transcripts table (key columns and the
        # transcript id column excluded)
        query_transcripts_infos_columns = f"""
            SELECT *
            FROM (
                DESCRIBE SELECT * FROM {transcripts_table}
            )
            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
        """
        transcripts_infos_columns = list(
            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
        )

        # SQL clauses reused by both the JSON and the FORMAT updates:
        # split comma-joined values into rows / JSON pairs / pipe segments
        clause_select = []
        clause_to_json = []
        clause_to_format = []
        for field in transcripts_infos_columns:
            clause_select.append(
                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
            )
            clause_to_json.append(f""" '{field}': "{field}" """)
            clause_to_format.append(f""" "{field}" """)

        # SET clauses accumulated for the two UPDATE statements
        update_set_json = []
        update_set_format = []

        # VCF header
        vcf_reader = self.get_header()

        # Transcripts to info column in JSON
        if transcripts_info_json is not None:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_json,
                column_type="JSON",
                default_value=None,
                drop=False,
            )

            # Add header
            # NOTE(review): 'unknwon' is a typo in the source/version fields
            # of the header record — confirm before changing, the literal may
            # be matched elsewhere.
            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
                transcripts_info_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_json.append(
                f""" {transcripts_info_json}=t.{transcripts_info_json} """
            )

        # Transcripts to info field in JSON
        if transcripts_info_field_json is not None:

            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")

            # Append ';<field>=<json>' to INFO, guarding empty/'.' values
            # NOTE(review): this clause references t.{transcripts_info_json};
            # if only transcripts_info_field_json is set (transcripts_info_json
            # None) the SQL contains 't.None' — the commented-out fallback
            # above suggests this case is expected to be pre-normalized;
            # confirm against callers.
            update_set_json.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_json}=',
                            t.{transcripts_info_json}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header
            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
                transcripts_info_field_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_json:

            # Update query: aggregate each variant's transcripts into one
            # JSON object keyed by transcript id, then join back on the
            # variant key columns
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_json)}
                FROM
                    (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            concat(
                                '{{',
                                string_agg(
                                    '"' || "{transcripts_column_id}" || '":' ||
                                    to_json(json_output)
                                ),
                                '}}'
                            )::JSON AS {transcripts_info_json}
                        FROM
                            (
                                SELECT
                                    "#CHROM", POS, REF, ALT,
                                    "{transcripts_column_id}",
                                    to_json(
                                        {{{",".join(clause_to_json)}}}
                                    )::JSON AS json_output
                                FROM
                                    (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                                WHERE "{transcripts_column_id}" IS NOT NULL
                            )
                        GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        # Transcripts to info column in FORMAT
        if transcripts_info_format is not None:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_format,
                column_type="VARCHAR",
                default_value=None,
                drop=False,
            )

            # Add header
            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
                transcripts_info_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_format.append(
                f""" {transcripts_info_format}=t.{transcripts_info_format} """
            )

        # Transcripts to info field in FORMAT
        if transcripts_info_field_format is not None:

            log.debug(f"{msg_info_prefix} - Annotation in structured format...")

            # Append ';<field>=<value>' to INFO, guarding empty/'.' values
            update_set_format.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_format}=',
                            t.{transcripts_info_format}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header
            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
                transcripts_info_field_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_format:

            # Update query: build 'transcript|field1|field2|...' per
            # transcript, aggregate per variant, then join back on the
            # variant key columns
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_format)}
                FROM
                    (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
                        FROM
                            (
                                SELECT
                                    "#CHROM", POS, REF, ALT,
                                    "{transcripts_column_id}",
                                    concat(
                                        "{transcripts_column_id}",
                                        '|',
                                        {", '|', ".join(clause_to_format)}
                                    ) AS {transcripts_info_format}
                                FROM
                                    (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                            )
                        GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        return True
The `transcript_view_to_variants` function updates a variants table with information from
transcripts, in JSON and/or structured (pipe-delimited) formats.
Parameters
- transcripts_table: The
`transcripts_table` parameter is used to specify the name of the table containing the transcripts data. If this parameter is not provided, the function will attempt to retrieve it from the `param` dictionary or use a default value of "transcripts" - transcripts_column_id: The
`transcripts_column_id` parameter is used to specify the column in the `transcripts_table` that contains the unique identifier for each transcript. This identifier is used to match transcripts with variants in the database - transcripts_info_json: The
`transcripts_info_json` parameter is used to specify the name of the column in the variants table where the transcripts information will be stored in JSON format. This parameter allows you to define the column in the variants table that will hold the JSON-formatted information about transcripts - transcripts_info_field_json: The
`transcripts_info_field_json` parameter is used to specify the field in the VCF header that will contain information about transcripts in JSON format. This field will be added to the VCF header as an INFO field with the specified name - transcripts_info_format: The
`transcripts_info_format` parameter is used to specify the format of the information about transcripts that will be stored in the variants table. This format can be used to define how the transcript information will be structured or displayed within the variants table - transcripts_info_field_format: The
`transcripts_info_field_format` parameter is used to specify the field in the VCF header that will contain information about transcripts in a specific format. This field will be added to the VCF header as an INFO field with the specified name - param: The
`param` parameter in the `transcript_view_to_variants` method is a dictionary that contains various configuration settings related to transcripts. It is used to provide default values for certain parameters if they are not explicitly provided when calling the method. The `param` dictionary can be passed as an argument
Returns
The function
`transcript_view_to_variants` returns a boolean value. It returns `True` if the operation is successful and `False` if certain conditions are not met.